aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatthew Lemon <lemon@matthewlemon.com>2019-09-27 15:12:57 +0100
committerMatthew Lemon <lemon@matthewlemon.com>2019-09-27 15:12:57 +0100
commitd4272b28da76c2d1fe110d611ef08a445f11ceb2 (patch)
treeaf01f431bd39504bb2d9234c7c6a41879b95051f
initial commit
Diffstat (limited to '')
-rw-r--r--.gitignore1
-rw-r--r--Pipfile18
-rw-r--r--Pipfile.lock159
-rw-r--r--datamaps/__init__.py1
-rw-r--r--datamaps/api/__init__.py5
-rw-r--r--datamaps/api/api.py14
-rw-r--r--datamaps/core/__init__.py3
-rw-r--r--datamaps/core/master.py208
-rw-r--r--datamaps/core/row.py84
-rw-r--r--datamaps/core/temporal.py121
-rw-r--r--datamaps/main.py131
-rw-r--r--datamaps/process/__init__.py1
-rw-r--r--datamaps/process/cell.py42
-rw-r--r--datamaps/process/cleansers.py287
-rw-r--r--datamaps/tests/__init__.py0
-rw-r--r--datamaps/tests/conftest.py9
-rw-r--r--datamaps/tests/resources/.~lock.master.xlsx#1
-rw-r--r--datamaps/tests/resources/master.xlsxbin0 -> 5889 bytes
-rw-r--r--datamaps/tests/test_api.py10
-rw-r--r--datamaps/tests/test_cleanser.py41
-rw-r--r--datamaps/tests/test_quarter.py52
-rw-r--r--datamaps/utils.py564
-rw-r--r--setup.py66
23 files changed, 1818 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9f11b75
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea/
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..7b1b4b4
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,18 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = "*"
+
+[packages]
+openpyxl = "*"
+bcompiler-engine = {editable = true,path = "/home/lemon/code/python/bcompiler-engine"}
+colorlog = "*"
+python-dateutil = "*"
+click = "*"
+datamaps = {editable = true,path = "."}
+
+[requires]
+python_version = "3.8"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..101c7c4
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,159 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "98952e013f654efa60ba726c96572096e1bdaa9f45d92d0bb65d9faf905bffa2"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.8"
+ },
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "appdirs": {
+ "hashes": [
+ "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92",
+ "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e"
+ ],
+ "version": "==1.4.3"
+ },
+ "bcompiler-engine": {
+ "editable": true,
+ "path": "/home/lemon/code/python/bcompiler-engine"
+ },
+ "click": {
+ "hashes": [
+ "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
+ "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+ ],
+ "index": "pypi",
+ "version": "==7.0"
+ },
+ "colorlog": {
+ "hashes": [
+ "sha256:3cf31b25cbc8f86ec01fef582ef3b840950dea414084ed19ab922c8b493f9b42",
+ "sha256:450f52ea2a2b6ebb308f034ea9a9b15cea51e65650593dca1da3eb792e4e4981"
+ ],
+ "index": "pypi",
+ "version": "==4.0.2"
+ },
+ "datamaps": {
+ "editable": true,
+ "path": "."
+ },
+ "et-xmlfile": {
+ "hashes": [
+ "sha256:614d9722d572f6246302c4491846d2c393c199cfa4edc9af593437691683335b"
+ ],
+ "version": "==1.0.1"
+ },
+ "jdcal": {
+ "hashes": [
+ "sha256:1abf1305fce18b4e8aa248cf8fe0c56ce2032392bc64bbd61b5dff2a19ec8bba",
+ "sha256:472872e096eb8df219c23f2689fc336668bdb43d194094b5cc1707e1640acfc8"
+ ],
+ "version": "==1.4.1"
+ },
+ "openpyxl": {
+ "hashes": [
+ "sha256:340a1ab2069764559b9d58027a43a24db18db0e25deb80f81ecb8ca7ee5253db"
+ ],
+ "index": "pypi",
+ "version": "==3.0.0"
+ },
+ "python-dateutil": {
+ "hashes": [
+ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+ "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+ ],
+ "index": "pypi",
+ "version": "==2.8.0"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ }
+ },
+ "develop": {
+ "atomicwrites": {
+ "hashes": [
+ "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+ "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+ ],
+ "version": "==1.3.0"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
+ "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+ ],
+ "version": "==19.1.0"
+ },
+ "more-itertools": {
+ "hashes": [
+ "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832",
+ "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4"
+ ],
+ "version": "==7.2.0"
+ },
+ "packaging": {
+ "hashes": [
+ "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47",
+ "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108"
+ ],
+ "version": "==19.2"
+ },
+ "pluggy": {
+ "hashes": [
+ "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6",
+ "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34"
+ ],
+ "version": "==0.13.0"
+ },
+ "py": {
+ "hashes": [
+ "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
+ "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
+ ],
+ "version": "==1.8.0"
+ },
+ "pyparsing": {
+ "hashes": [
+ "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80",
+ "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4"
+ ],
+ "version": "==2.4.2"
+ },
+ "pytest": {
+ "hashes": [
+ "sha256:813b99704b22c7d377bbd756ebe56c35252bb710937b46f207100e843440b3c2",
+ "sha256:cc6620b96bc667a0c8d4fa592a8c9c94178a1bd6cc799dbb057dfd9286d31a31"
+ ],
+ "index": "pypi",
+ "version": "==5.1.3"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ }
+ }
+}
diff --git a/datamaps/__init__.py b/datamaps/__init__.py
new file mode 100644
index 0000000..c9bd737
--- /dev/null
+++ b/datamaps/__init__.py
@@ -0,0 +1 @@
+__version__ = "1.0.0b0"
diff --git a/datamaps/api/__init__.py b/datamaps/api/__init__.py
new file mode 100644
index 0000000..bb48fdb
--- /dev/null
+++ b/datamaps/api/__init__.py
@@ -0,0 +1,5 @@
+from .api import project_data_from_master_api as project_data_from_master
+from ..core.master import Master as Master
+from ..core.row import Row as Row
+from ..core.temporal import FinancialYear as FinancialYear
+from ..core.temporal import Quarter as Quarter
diff --git a/datamaps/api/api.py b/datamaps/api/api.py
new file mode 100644
index 0000000..59458e7
--- /dev/null
+++ b/datamaps/api/api.py
@@ -0,0 +1,14 @@
+from ..core import Master, Quarter
+
+
+def project_data_from_master_api(master_file: str, quarter: int, year: int):
+ """Create a Master object directly without the need to explicitly pass
+ a Quarter object.
+
+ Args:
+ master_file (str): the path to a master file
+ quarter (int): an integer representing the financial quarter
+ year (int): an integer representing the year
+ """
+ m = Master(Quarter(quarter, year), master_file)
+ return m
diff --git a/datamaps/core/__init__.py b/datamaps/core/__init__.py
new file mode 100644
index 0000000..305208d
--- /dev/null
+++ b/datamaps/core/__init__.py
@@ -0,0 +1,3 @@
+from .row import Row
+from .temporal import Quarter, FinancialYear
+from .master import Master, ProjectData
diff --git a/datamaps/core/master.py b/datamaps/core/master.py
new file mode 100644
index 0000000..1fbfe90
--- /dev/null
+++ b/datamaps/core/master.py
@@ -0,0 +1,208 @@
+import re
+import datetime
+import logging
+import unicodedata
+from pathlib import Path
+from typing import List, Tuple, Iterable, Optional, Any
+
+from ..utils import project_data_from_master
+from ..process.cleansers import DATE_REGEX_4
+from .temporal import Quarter
+
+from openpyxl import load_workbook
+
+logger = logging.getLogger('bcompiler.utils')
+
+
+class ProjectData:
+ """
+ ProjectData class
+ """
+ def __init__(self, d: dict) -> None:
+ """
+ :py:func:`OrderedDict` is easiest to get from project_data_from_master[x]
+ """
+ self._data = d
+
+ def __len__(self) -> int:
+ return len(self._data)
+
+ def __getitem__(self, item):
+ return self._data[item]
+
+ def key_filter(self, key: str) -> List[Tuple]:
+ """
+ Return a list of (k, v) tuples if k in master key.
+ """
+ data = [item for item in self._data.items() if key in item[0]]
+ if not data:
+ raise KeyError("Sorry, there is no matching data")
+ return (data)
+
+ def pull_keys(self, input_iter: Iterable, flat=False) -> List[Tuple[Any, ...]]:
+ """
+ Returns a list of (key, value) tuples from ProjectData if key matches a
+ key. The order of tuples is based on the order of keys passed in the iterable.
+ """
+ if flat is True:
+ # search and replace troublesome EN DASH character
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0].strip().replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')) == i]
+ xs = [_convert_str_date_to_object(x) for x in xs]
+ ts = sorted(xs, key=lambda x: input_iter.index(x[0].strip().replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS'))))
+ ts = [item[1] for item in ts]
+ return ts
+ else:
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0].replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')) == i]
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0] == i]
+ xs = [_convert_str_date_to_object(x) for x in xs]
+ ts = sorted(xs, key=lambda x: input_iter.index(x[0].replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS'))))
+ return ts
+
+ def __repr__(self):
+ return f"ProjectData() - with data: {id(self._data)}"
+
+
+def _convert_str_date_to_object(d_str: tuple) -> Tuple[str, Optional[datetime.date]]:
+ try:
+ if re.match(DATE_REGEX_4, d_str[1]):
+ try:
+ ds = d_str[1].split('-')
+ return (d_str[0], datetime.date(int(ds[0]), int(ds[1]), int(ds[2])))
+ except TypeError:
+ return d_str
+ else:
+ return d_str
+ except TypeError:
+ return d_str
+
+
+class Master:
+ """A Master object, representing the main central data item in ``bcompiler``.
+
+ Args:
+ quarter (:py:class:`bcompiler.api.Quarter`): creating using ``Quarter(1, 2017)`` for example.
+ path (str): path to the master xlsx file
+
+ A master object is a composition between a :py:class:`bcompiler.api.Quarter` object and an
+ actual master xlsx file on disk.
+
+ You create one, either by creating the Quarter object first, and using that as the first
+ parameter of the ``Master`` constructor, e.g.::
+
+ from bcompiler.api import Quarter
+ from bcompiler.api import Master
+
+ q1 = Quarter(1, 2016)
+ m1 = Master(q1, '/tmp/master_1_2016.xlsx')
+
+ or by doing both in one::
+
+ m1 = Master(Quarter(1, 2016), '/tmp/master_1_2016.xlsx')
+
+ Once you have a ``Master`` object, you can access project data from it, like this::
+
+ project_data = m1['Project Title']
+
+
+ The following *attributes* are available on `m1` once created as such, e.g.::
+
+ data = m1.data
+ quarter = m1.quarter
+ filename = m1.filename
+ ..etc
+ """
+ def __init__(self, quarter: Quarter, path: str) -> None:
+ self._quarter = quarter
+ self.path = path
+ self._data = project_data_from_master(self.path)
+ self._project_titles = [item for item in self.data.keys()]
+ self.year = self._quarter.year
+
+ def __getitem__(self, project_name):
+ return ProjectData(self._data[project_name])
+
+ @property
+ def data(self):
+ """Return all the data contained in the master in a large, nested dictionary.
+
+ The resulting data structure contains a dictionary of :py:class:`colletions.OrderedDict` items whose
+ key is the name of a project::
+
+ "Project Name": OrderedDict("key": "value"
+ ...)
+
+ This object can then be further interrogated, for example to obtain all key/values
+ from a partictular project, by doing::
+
+ d = Master.data
+ project_data = d['PROJECT_NAME']
+
+ """
+ return self._data
+
+ @property
+ def quarter(self):
+ """Returns the ``Quarter`` object associated with the ``Master``.
+
+ Example::
+
+ q1 = m.quarter
+
+ ``q1`` can then be further interrogated as documented in :py:class:`core.temporal.Quarter`.
+
+ """
+
+ return self._quarter
+
+ @property
+ def filename(self):
+ """The filename of the master xlsx file, e.g. ``master_1_2017.xlsx``.
+ """
+ p = Path(self.path)
+ return p.name
+
+ @property
+ def projects(self):
+ """A list of project titles derived from the master xlsx.
+ """
+ return self._project_titles
+
+ def duplicate_keys(self, to_log=None):
+ """Checks for duplicate keys in a master xlsx file.
+
+ Args:
+ to_log (bool): Optional True or False, depending on whether you want to see duplicates reported in a ``WARNING`` log message. This is used mainly for internal purposes within ``bcompiler``.
+
+ Returns:
+ duplicates (set): a set of duplicated keys
+ """
+ wb = load_workbook(self.path)
+ ws = wb.active
+ col_a = next(ws.iter_cols())
+ col_a = [item.value for item in col_a]
+ seen: set = set()
+ uniq = []
+ dups: set = set()
+ for x in col_a:
+ if x not in seen:
+ uniq.append(x)
+ seen.add(x)
+ else:
+ dups.add(x)
+ if to_log and len(dups) > 0:
+ for x in dups:
+ logger.warning(f"{self.path} contains duplicate key: \"{x}\". Masters cannot contain duplicate keys. Rename them.")
+ return True
+ elif to_log and len(dups) == 0:
+ logger.info(f"No duplicate keys in {self.path}")
+ return False
+ elif len(dups) > 0:
+ return dups
+ else:
+ return False
+
+ def __repr__(self):
+ return f"Master({self.path}, {self.quarter.quarter}, {self.quarter.year})"
diff --git a/datamaps/core/row.py b/datamaps/core/row.py
new file mode 100644
index 0000000..470b753
--- /dev/null
+++ b/datamaps/core/row.py
@@ -0,0 +1,84 @@
+import string
+import datetime
+
+from ..process.cell import Cell
+from typing import Iterable
+
+from itertools import chain
+
+
+class Row:
+ """
+ A Row object is populated with an iterable (list or other sequence), bound
+ to an openpyxl worksheet. It is used to populate a row of cells in an output
+ Excel file with the values from the iterable.
+
+ The ``anchor_column`` and ``anchor_row`` parameters represent the coordinates of
+ a cell which form the *leftmost* cell of the row, i.e. to set the row of data
+ to start at the very top left of a sheet, you'd create the ``Row()`` object this::
+
+ r = Row(1, 1, interable)
+ r.bind(ws)
+ """
+
+ def __init__(self, anchor_column: int, anchor_row: int, seq: Iterable):
+ if isinstance(anchor_column, str):
+ if len(anchor_column) == 1:
+ enumerated_alphabet = list(enumerate(string.ascii_uppercase, start=1))
+ col_letter = [x for x in enumerated_alphabet if x[1] == anchor_column][0]
+ self._anchor_column = col_letter[0]
+ self._anchor_row = anchor_row
+ self._cell_map = []
+ elif len(anchor_column) == 2:
+ enumerated_alphabet = list(enumerate(list(chain(
+ string.ascii_uppercase, ["{}{}".format(x[0], x[1]) for x in list(zip(['A'] * 26, string.ascii_uppercase))])), start=1))
+ col_letter = [x for x in enumerated_alphabet if x[1] == anchor_column][0]
+ self._anchor_column = col_letter[0]
+ self._anchor_row = anchor_row
+ self._cell_map = []
+ else:
+ raise ValueError("You can only have a column up to AZ")
+ else:
+ self._anchor_column = anchor_column
+ self._anchor_row = anchor_row
+ self._cell_map = []
+ self._seq = seq
+
+
+ def _basic_bind(self, ws):
+ for x in list(enumerate(self._seq, start=self._anchor_column)):
+ self._ws.cell(row=self._anchor_row, column=x[0], value=x[1])
+
+
+ def _cell_bind(self, ws):
+ self._cell_map = []
+ for x in list(enumerate(self._seq, start=self._anchor_column)):
+ self._cell_map.append(
+ Cell(
+ cell_key="",
+ cell_value=x[1],
+ cell_reference=f"{self._anchor_column}{self._anchor_row}",
+ template_sheet=ws,
+ bg_colour=None,
+ fg_colour=None,
+ number_format=None,
+ verification_list=None,
+ r_idx=self._anchor_row,
+ c_idx=x[0]
+ )
+ )
+ for c in self._cell_map:
+ if not isinstance(c.cell_value, datetime.date) and not None:
+ self._ws.cell(row=c.r_idx, column=c.c_idx, value=c.cell_value).number_format = '0'
+ else:
+ self._ws.cell(row=c.r_idx, column=c.c_idx, value=c.cell_value)
+
+
+
+ def bind(self, worksheet):
+ """Bind the Row to a particular worksheetl, which effectively does the
+ printing of data into cells. Must be done prior to saving the workbook.
+ """
+ self._ws = worksheet
+# self._basic_bind(self._ws)
+ self._cell_bind(self._ws)
diff --git a/datamaps/core/temporal.py b/datamaps/core/temporal.py
new file mode 100644
index 0000000..ddb7004
--- /dev/null
+++ b/datamaps/core/temporal.py
@@ -0,0 +1,121 @@
+import datetime
+
+
+class FinancialYear:
+ """An object representing a financial year.
+
+ Used by ``bcompiler`` internally when creating :py:class:`bcompiler.api.Master` objects.
+ Can be used to calculate start and ends dates and :py:class:`bcompiler.api.Quarter` objects.
+
+ If parameter ``year`` must be in the range 150 - 2100.
+
+ """
+
+ def __init__(self, year):
+ if isinstance(year, int) and (year in range(150, 2100)):
+ self.year = year
+ else:
+ raise ValueError("A year must be an integer between 1950 and 2100")
+ self._generate_quarters()
+ self._q1 = self.quarters[0]
+ self._q2 = self.quarters[1]
+ self._q3 = self.quarters[2]
+ self._q4 = self.quarters[3]
+
+ self.start_date = self.q1.start_date
+ self.end_date = self.q4.end_date
+
+ @property
+ def q1(self) -> datetime.date:
+ """Quarter 1 as a :py:class:`datetime.date` object
+ """
+ return self._q1
+
+ @property
+ def q2(self):
+ """Quarter 2 as a :py:class:`datetime.date` object
+ """
+ return self._q2
+
+ @property
+ def q3(self):
+ """Quarter 3 as a :py:class:`datetime.date` object
+ """
+ return self._q3
+
+ @property
+ def q4(self):
+ """Quarter 4 as a :py:class:`datetime.date` object
+ """
+ return self._q4
+
+ def __str__(self):
+ return f"FY{str(self.year)}/{str(self.year + 1)[2:]}"
+
+ def _generate_quarters(self):
+ self.quarters = [Quarter(x, self.year) for x in range(1, 5)]
+
+
+ def __repr__(self):
+ return f"FinancialYear({self.year})"
+
+
+class Quarter:
+ """An object representing a financial quarter. This is mainly required for building
+ a :py:class:`core.master.Master` object.
+
+ Args:
+ quarter (int): e.g.1, 2, 3 or 4
+ year (int): e.g. 2013
+ """
+ _start_months = {
+ 1: (4, 'April'),
+ 2: (7, 'July'),
+ 3: (10, 'October'),
+ 4: (1, 'January')
+ }
+
+ _end_months = {
+ 1: (6, 'June', 30),
+ 2: (9, 'September', 30),
+ 3: (12, 'December', 31),
+ 4: (3, 'March', 31),
+ }
+
+
+ def __init__(self, quarter: int, year: int) -> None:
+
+ if isinstance(quarter, int) and (quarter >= 1 and quarter <= 4):
+ self.quarter = quarter
+ else:
+ raise ValueError("A quarter must be either 1, 2, 3 or 4")
+
+ if isinstance(year, int) and (year in range(1950, 2100)):
+ self.year = year
+ else:
+ raise ValueError("Year must be between 1950 and 2100 - surely that will do?")
+
+ self.start_date = self._start_date(self.quarter, self.year)
+ self.end_date = self._end_date(self.quarter, self.year)
+
+ def __str__(self):
+ return f"Q{self.quarter} {str(self.year)[2:]}/{str(self.year + 1)[2:]}"
+
+ def _start_date(self, q, y):
+ if q == 4:
+ y = y + 1
+ return datetime.date(y, Quarter._start_months[q][0], 1)
+
+ def _end_date(self, q, y):
+ if q == 4:
+ y = y + 1
+ return datetime.date(y, Quarter._end_months[q][0], Quarter._end_months[q][2])
+
+ def __repr__(self):
+ return f"Quarter({self.quarter}, {self.year})"
+
+ @property
+ def fy(self):
+ """Return a :py:class:`core.temporal.FinancialYear` object.
+ """
+ return FinancialYear(self.year)
diff --git a/datamaps/main.py b/datamaps/main.py
new file mode 100644
index 0000000..b2bd202
--- /dev/null
+++ b/datamaps/main.py
@@ -0,0 +1,131 @@
+"""
+Copyright (c) 2019 Matthew Lemon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE. """
+import logging
+from functools import partial
+
+import click
+
+import colorlog
+from engine.adapters import cli as engine_cli
+from engine.config import Config as engine_config
+from engine.use_cases.parsing import MalFormedCSVHeaderException
+
+logger = colorlog.getLogger("bcompiler")
+logger.setLevel(logging.INFO)
+
+# we want to pass echo func down to bcompiler-engine
+output_funcs = dict(
+ click_echo_green=partial(click.secho, nl=False, fg="green"),
+ click_echo_yellow=partial(click.secho, nl=False, fg="yellow"),
+ click_echo_red=partial(click.secho, nl=False, fg="red"),
+ click_echo_white=partial(click.secho, nl=False, fg="white"),
+)
+
+
+class Config:
+ def __init__(self):
+ self.verbose = False
+
+
+pass_config = click.make_pass_decorator(Config, ensure=True)
+
+
+@click.group()
+@click.option("--verbose", is_flag=True)
+@pass_config
+def cli(config, verbose):
+ """
+ bcompiler is a tool for moving data to and from spreadsheets. See web site, etc.
+ """
+ config.verbose = verbose
+
+
+@cli.group("import")
+def _import():
+ """
+ Import something (a batch of populated templates, a datamap, etc).
+ """
+
+
+@cli.group("export")
+def export():
+ """
+ Export something (master data to blank templates, etc).
+ """
+
+
+@cli.group("report")
+def report():
+ """Create a report"""
+
+
+@_import.command()
+@click.option(
+ "--to-master",
+ "-m",
+ is_flag=True,
+ default=False,
+ help="Create master.xlsx immediately",
+)
+def templates(to_master):
+ engine_config.initialise()
+ click.secho("Hello from bcompiler 2.0!", fg="yellow")
+ if to_master:
+ try:
+ engine_cli.import_and_create_master(echo_funcs=output_funcs)
+ except MalFormedCSVHeaderException as e:
+ click.echo(
+ click.style("Incorrect headers in datamap. {}.".format(e.args[0]), bold=True, reverse=True, fg="cyan"))
+ else:
+ click.secho("Not implemented yet. Try --to-master/-m flag")
+
+
+@export.command()
+# @click.argument("datamap")
+# @click.argument("blank")
+@click.argument("master")
+def master(master):
+ engine_config.initialise()
+
+ input_dir = engine_config.PLATFORM_DOCS_DIR / "input"
+
+ blank_fn = engine_config.config_parser["DEFAULT"]["blank file name"]
+ datamap_fn = engine_config.config_parser["DEFAULT"]["datamap file name"]
+
+ blank = input_dir / blank_fn
+ datamap = input_dir / datamap_fn
+
+ click.secho(f"Exporting master {master} to templates based on {blank}...")
+
+ try:
+ engine_cli.write_master_to_templates(blank, datamap, master)
+ except FileNotFoundError as e:
+ click.secho(str(e), fg="red")
+
+
+@report.command()
+@click.argument("target_file")
+def data_validations(target_file):
+ """Requires the path to the target spreadsheet file."""
+ click.secho(f"Getting data validations from: {target_file}", fg="green")
+ engine_config.initialise()
+ report = engine_cli.report_data_validations_in_file(target_file)
+ for r in report:
+ click.secho(r)
diff --git a/datamaps/process/__init__.py b/datamaps/process/__init__.py
new file mode 100644
index 0000000..12a88fc
--- /dev/null
+++ b/datamaps/process/__init__.py
@@ -0,0 +1 @@
+from .cleansers import Cleanser
diff --git a/datamaps/process/cell.py b/datamaps/process/cell.py
new file mode 100644
index 0000000..edd964b
--- /dev/null
+++ b/datamaps/process/cell.py
@@ -0,0 +1,42 @@
+from typing import Any, Union
+
+
+class Cell:
+ """
+ Purpose of the Cell object is to hold data about a spreadsheet cell.
+ They are used to populate a datamap cell_map and to write out data to
+ a template.
+ """
+ def __init__(self,
+ cell_key: str,
+ cell_value: Any,
+ cell_reference: str,
+ template_sheet: str,
+ bg_colour: Union[str, None],
+ fg_colour: Union[str, None],
+ number_format: Union[str, None],
+ verification_list: Union[str, None],
+ r_idx: int=None,
+ c_idx: int=None
+ ) -> None:
+ if cell_value:
+ self.cell_value = cell_value
+ else:
+ self.cell_value = None
+ self.cell_key = cell_key
+ self.cell_reference = cell_reference
+ self.template_sheet = template_sheet
+ self.bg_colour = bg_colour
+ self.fg_colour = fg_colour
+ self.number_format = number_format
+ self.verification_list = verification_list
+ self.r_idx = r_idx
+ self.c_idx = c_idx
+
+ def __repr__(self) -> str:
+ return ("<Cell: cell_key: {} cell_value: {} cell_reference: {} "
+ "template_sheet: {}>".format(
+ self.cell_key,
+ self.cell_value,
+ self.cell_reference,
+ self.template_sheet))
diff --git a/datamaps/process/cleansers.py b/datamaps/process/cleansers.py
new file mode 100644
index 0000000..d562f9c
--- /dev/null
+++ b/datamaps/process/cleansers.py
@@ -0,0 +1,287 @@
+import datetime
+import re
+from datetime import date
+from operator import itemgetter
+
+import colorlog
+from dateutil.parser import parse
+
+logger = colorlog.getLogger("bcompiler.cleanser")
+
+ENDASH_REGEX = r"–"
+ENDASH_FIX = r"-"
+COMMA_REGEX = r",\s?"
+COMMA_FIX = r" "
+APOS_REGEX = r"^'"
+APOS_FIX = r""
+TRAILING_SPACE_REGEX = r"(.+)( | )$"
+DATE_REGEX = r"^(\d{2,4})(\/|-|\.)(\d{1,2})(\/|-|\.)(\d{2,4})"
+DATE_REGEX_4 = r"^(\d{2,4})(/|-|\.)(\d{1,2})(/|-|\.)(\d{1,2})"
+DATE_REGEX_TIME = r"^(\d{2,4})(/|-)(\d{1,2})(/|-)(\d{1,2})\s(0:00:00)"
+INT_REGEX = r"^[-+]?\d+$"
+FLOAT_REGEX = r"^[-+]?([0-9]*)\.[0-9]+$"
+NL_REGEX = r"\n"
+NL_FIX = r" | "
+SPACE_PIPE_CHAR_REGEX = r"\ \|\S"
+SPACE_PIPE_CHAR_FIX = r" | "
+PERCENT_REGEX = r"^(\d{1,3})%$"
+POUND_REGEX = r"^(-)?£(\d+(\.\d{1,2})?)(\d+)?$" # handles negative numbers
+
+
+class Cleanser:
+ """
+ Takes a string, and cleans it.
+
+ Doctests:
+ >>> t = "Text, with commas"
+ >>> c = Cleanser(t)
+ >>> c.clean()
+ 'Text with commas'
+ >>> a = "\'Text with leading apos."
+ >>> c = Cleanser(a)
+ >>> c.clean()
+ 'Text with leading apos.'
+
+ """
+
+ def __init__(self, string):
+ self.string = string
+
+ # a list of dicts that describe everything needed to fix errors in
+ # string passed to class constructor. Method self.clean() runs through
+ # them, fixing each in turn.
+ self._checks = [
+ dict(
+ c_type="emdash",
+ rule=ENDASH_REGEX,
+ fix=ENDASH_FIX,
+ func=self._endash,
+ count=0,
+ ),
+ dict(
+ c_type="commas",
+ rule=COMMA_REGEX,
+ fix=COMMA_FIX,
+ func=self._commas,
+ count=0,
+ ),
+ dict(
+ c_type="leading_apostrophe",
+ rule=APOS_REGEX,
+ fix=APOS_FIX,
+ func=self._apostrophe,
+ count=0,
+ ),
+ dict(c_type="newline",
+ rule=NL_REGEX,
+ fix=NL_FIX,
+ func=self._newline,
+ count=0),
+ dict(
+ c_type="double_space",
+ rule=" ",
+ fix=" ",
+ func=self._doublespace,
+ count=0,
+ ),
+ dict(
+ c_type="trailing_space",
+ rule=TRAILING_SPACE_REGEX,
+ fix=None,
+ func=self._trailingspace,
+ count=0,
+ ),
+ dict(
+ c_type="pipe_char",
+ rule=SPACE_PIPE_CHAR_REGEX,
+ fix=SPACE_PIPE_CHAR_FIX,
+ func=self._space_pipe_char,
+ count=0,
+ ),
+ dict(c_type="date",
+ rule=DATE_REGEX,
+ fix=None,
+ func=self._date,
+ count=0),
+ dict(
+ c_type="date_time",
+ rule=DATE_REGEX_TIME,
+ fix=None,
+ func=self._date_time,
+ count=0,
+ ),
+ dict(c_type="int",
+ rule=INT_REGEX,
+ fix=None,
+ func=self._int,
+ count=0),
+ dict(c_type="float",
+ rule=FLOAT_REGEX,
+ fix=None,
+ func=self._float,
+ count=0),
+ dict(
+ c_type="percent",
+ rule=PERCENT_REGEX,
+ fix=None,
+ func=self._percent,
+ count=0,
+ ),
+ dict(c_type="pound",
+ rule=POUND_REGEX,
+ fix=None,
+ func=self._pound,
+ count=0),
+ ]
+ self.checks_l = len(self._checks)
+ self._analyse()
+
+ def _sort_checks(self):
+ """
+ Sorts the list of dicts in self._checks by their count, highest
+ first, so that when the fix methods run down them, they always have
+ a count with a value higher than 0 to run with, otherwise later
+ fixes might not get hit.
+ """
+ self._checks = sorted(self._checks,
+ key=itemgetter("count"),
+ reverse=True)
+
+ def _endash(self, regex, fix):
+ """
+ Turns – into -.
+ """
+ return re.sub(regex, fix, self.string)
+
+ def _pound(self, regex, fix):
+ """
+ Turns £12.24 into 12.24 (a float).
+ """
+ m = re.match(regex, self.string)
+ sum_p = m.group(2)
+ if m.group(1) == "-":
+ return float(sum_p) * -1
+ else:
+ return float(sum_p)
+
+ def _percent(self, regex, fix):
+ """
+ Turns 100% into 1.0.
+ """
+ m = re.match(regex, self.string)
+ p = int(m.group(1))
+ return p / 100
+
+ def _float(self, regex, fix):
+ """
+ Turns numbers that look like floats into floats.
+ """
+ return float(self.string)
+
+ def _int(self, regex, fix):
+ """
+ Turns numbers that look like integers into integers.
+ """
+ return int(self.string)
+
+ def _date(self, regex, fix):
+ """
+ Handles dates in "03/05/2016" format.
+ """
+ m = re.match(regex, self.string)
+ if int(m.groups()[-1]) in range(1965, 1967):
+ logger.warning(
+ ("Dates inputted as dd/mm/65 will migrate as dd/mm/2065. "
+ "Dates inputted as dd/mm/66 will migrate as dd/mm/1966."))
+ try:
+ if len(m.string.split("-")[0]) == 4: # year is first
+ return datetime.date(
+ int(m.string.split("-")[0]),
+ int(m.string.split("-")[1]),
+ int(m.string.split("-")[2]),
+ )
+ else:
+ return parse(m.string, dayfirst=True).date()
+ except IndexError:
+ pass
+ except ValueError:
+ logger.warning(
+ 'Potential date issue (perhaps a date mixed with free text?): "{}"'
+ .format(self.string))
+ return self.string
+
+ def _date_time(self, regex, fix):
+ """
+ Handles dates in "2017-05-01 0:00:00" format. We get this from the
+ csv file when we send it back out to templates/forms. Returns a Python
+ date object.
+ """
+ m = re.match(regex, self.string)
+ year = int(m.group(1))
+ month = int(m.group(3))
+ day = int(m.group(5))
+ try:
+ return date(year, month, day)
+ except ValueError:
+ logger.error("Incorrect date format {}!".format(self.string))
+ return self.string
+
+ def _commas(self, regex, fix):
+ """
+ Handles commas in self.string according to rule in self._checks
+ """
+ # we want to sort the list first so self._checks has any item
+ # with a count > 0 up front, otherwise if a count of 0 appears
+ # before it in the list, the > 0 count never gets fixed
+ return re.sub(regex, fix, self.string)
+
+ def _apostrophe(self, regex, fix):
+ """Handles apostrophes as first char of the string."""
+ return self.string.lstrip("'")
+
+ def _newline(self, regex, fix):
+ """Handles newlines anywhere in string."""
+ return re.sub(regex, fix, self.string)
+
+ def _doublespace(self, regex, fix):
+ """Handles double-spaces anywhere in string."""
+ return re.sub(regex, fix, self.string)
+
+ def _trailingspace(self, regex, fix):
+ """Handles trailing space in the string."""
+ return self.string.strip()
+
+ def _space_pipe_char(self, regex, fix):
+ """Handles space pipe char anywhere in string."""
+ return re.sub(regex, fix, self.string)
+
+ def _access_checks(self, c_type):
+ """Helper method returns the index of rule in self._checks
+ when given a c_type"""
+ return self._checks.index(
+ next(item for item in self._checks if item["c_type"] == c_type))
+
+ def _analyse(self):
+ """
+ Uses the self._checks table as a basis for counting the number of
+ each cleaning target required, and calling the appropriate method
+ to clean.
+ """
+ i = 0
+ while i < self.checks_l:
+ matches = re.finditer(self._checks[i]["rule"], self.string)
+ if matches:
+ self._checks[i]["count"] += len(list(matches))
+ i += 1
+
+ def clean(self):
+ """Runs each applicable cleaning action and returns the cleaned
+ string."""
+ self._sort_checks()
+ for check in self._checks:
+ if check["count"] > 0:
+ self.string = check["func"](check["rule"], check["fix"])
+ check["count"] = 0
+ else:
+ return self.string
+ return self.string
diff --git a/datamaps/tests/__init__.py b/datamaps/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/datamaps/tests/__init__.py
diff --git a/datamaps/tests/conftest.py b/datamaps/tests/conftest.py
new file mode 100644
index 0000000..22f20bf
--- /dev/null
+++ b/datamaps/tests/conftest.py
@@ -0,0 +1,9 @@
+import os
+
+import pytest
+from pathlib import Path
+
+
+@pytest.fixture
+def master() -> Path:
+ return Path.cwd() / "tests" / "resources" / "master.xlsx"
diff --git a/datamaps/tests/resources/.~lock.master.xlsx# b/datamaps/tests/resources/.~lock.master.xlsx#
new file mode 100644
index 0000000..7fb98fe
--- /dev/null
+++ b/datamaps/tests/resources/.~lock.master.xlsx#
@@ -0,0 +1 @@
+,lemon,grafter.chutney.lan,27.09.2019 14:05,file:///home/lemon/.config/libreoffice/4; \ No newline at end of file
diff --git a/datamaps/tests/resources/master.xlsx b/datamaps/tests/resources/master.xlsx
new file mode 100644
index 0000000..bf5adf0
--- /dev/null
+++ b/datamaps/tests/resources/master.xlsx
Binary files differ
diff --git a/datamaps/tests/test_api.py b/datamaps/tests/test_api.py
new file mode 100644
index 0000000..273f984
--- /dev/null
+++ b/datamaps/tests/test_api.py
@@ -0,0 +1,10 @@
+import datetime
+
+from ..api import project_data_from_master
+
+
+def test_get_project_data(master):
+ master = project_data_from_master(master, 1, 2019)
+ assert master["Chutney Bridge.xlsm"]["Project/Programme Name"] == "Chutney Bridge Ltd"
+ assert master.quarter.quarter == 1
+ assert master.quarter.end_date == datetime.date(2019, 6, 30)
diff --git a/datamaps/tests/test_cleanser.py b/datamaps/tests/test_cleanser.py
new file mode 100644
index 0000000..9d6df1a
--- /dev/null
+++ b/datamaps/tests/test_cleanser.py
@@ -0,0 +1,41 @@
+import datetime
+from ..process.cleansers import Cleanser
+
+
+def test_cleaning_dot_date():
+ ds = "25.1.72"
+ ds_double = "25.01.72"
+ four_year = "25.01.1972"
+ c = Cleanser(ds)
+ c_double = Cleanser(ds_double)
+ c_four = Cleanser(four_year)
+ assert c.clean() == datetime.date(1972, 1, 25)
+ assert c_double.clean() == datetime.date(1972, 1, 25)
+ assert c_four.clean() == datetime.date(1972, 1, 25)
+
+
+def test_cleaning_slash_date():
+ ds = "25/1/72"
+ ds_double = "25/01/72"
+ four_year = "25/01/1972"
+ c = Cleanser(ds)
+ c_double = Cleanser(ds_double)
+ c_four = Cleanser(four_year)
+ assert c.clean() == datetime.date(1972, 1, 25)
+ assert c_double.clean() == datetime.date(1972, 1, 25)
+ assert c_four.clean() == datetime.date(1972, 1, 25)
+
+
+def test_em_dash_key():
+ contains_em_dash = 'Pre 14-15 BL – Income both Revenue and Capital'
+ c = Cleanser(contains_em_dash)
+ assert c.clean() == 'Pre 14-15 BL - Income both Revenue and Capital'
+
+
+def test_double_trailing_space():
+ contains_double_trailing = 'Pre 14-15 BL - Incoming both Revenue and Capital '
+ contains_single_trailing = 'Pre 14-15 BL - Incoming both Revenue and Capital '
+ c = Cleanser(contains_double_trailing)
+ assert c.clean() == 'Pre 14-15 BL - Incoming both Revenue and Capital'
+ c = Cleanser(contains_single_trailing)
+ assert c.clean() == 'Pre 14-15 BL - Incoming both Revenue and Capital'
diff --git a/datamaps/tests/test_quarter.py b/datamaps/tests/test_quarter.py
new file mode 100644
index 0000000..c5b1824
--- /dev/null
+++ b/datamaps/tests/test_quarter.py
@@ -0,0 +1,52 @@
+import datetime
+
+import pytest
+
+from ..core import Quarter
+
+
+def test_initialisation():
+ q = Quarter(1, 2017)
+ assert q.start_date == datetime.date(2017, 4, 1)
+ assert q.end_date == datetime.date(2017, 6, 30)
+ q = Quarter(2, 2017)
+ assert q.start_date == datetime.date(2017, 7, 1)
+ assert q.end_date == datetime.date(2017, 9, 30)
+ q = Quarter(4, 2017)
+ assert q.start_date == datetime.date(2018, 1, 1)
+ assert q.end_date == datetime.date(2018, 3, 31)
+
+
+def test_desc_string():
+ assert str(Quarter(1, 2013)) == "Q1 13/14"
+ assert str(Quarter(2, 2013)) == "Q2 13/14"
+ assert str(Quarter(3, 2013)) == "Q3 13/14"
+ assert str(Quarter(4, 2013)) == "Q4 13/14"
+
+ assert str(Quarter(1, 1998)) == "Q1 98/99"
+ assert str(Quarter(2, 1998)) == "Q2 98/99"
+ assert str(Quarter(3, 1998)) == "Q3 98/99"
+ assert str(Quarter(4, 1998)) == "Q4 98/99"
+
+ assert str(Quarter(1, 1999)) == "Q1 99/00"
+ assert str(Quarter(2, 1999)) == "Q2 99/00"
+ assert str(Quarter(3, 1999)) == "Q3 99/00"
+ assert str(Quarter(4, 1999)) == "Q4 99/00"
+
+
+def test_errors():
+ with pytest.raises(ValueError) as excinfo:
+ Quarter(5, 2017)
+ assert "A quarter must be either 1, 2, 3 or 4" in str(excinfo.value)
+
+ with pytest.raises(ValueError) as excinfo:
+ Quarter(3, 1921)
+ assert "Year must be between 1950 and 2100 - surely that will do?" in str(excinfo.value)
+
+ with pytest.raises(ValueError) as excinfo:
+ Quarter("3", 2016)
+ assert "A quarter must be either 1, 2, 3 or 4" in str(excinfo.value)
+
+ with pytest.raises(ValueError) as excinfo:
+ Quarter(3, "1921")
+ assert "Year must be between 1950 and 2100 - surely that will do?" in str(excinfo.value)
diff --git a/datamaps/utils.py b/datamaps/utils.py
new file mode 100644
index 0000000..950369e
--- /dev/null
+++ b/datamaps/utils.py
@@ -0,0 +1,564 @@
+import configparser
+import csv
+import fnmatch
+import logging
+import os
+import sys
+from collections import OrderedDict
+from datetime import date, datetime
+from math import isclose
+
+from openpyxl import Workbook, load_workbook
+from openpyxl.styles import PatternFill
+from openpyxl.utils import quote_sheetname
+
+from .process.cleansers import Cleanser
+
+logger = logging.getLogger("bcompiler.utils")
+
+rdel_cdel_merge = ""
+
+DOCS = os.path.join(os.path.expanduser("~"), "Documents")
+BCOMPILER_WORKING_D = "bcompiler"
+ROOT_PATH = os.path.join(DOCS, BCOMPILER_WORKING_D)
+SOURCE_DIR = os.path.join(ROOT_PATH, "source")
+
+CONFIG_FILE = os.path.join(SOURCE_DIR, "config.ini")
+
+runtime_config = configparser.ConfigParser()
+runtime_config.read(CONFIG_FILE)
+
+CURRENT_QUARTER = runtime_config["QuarterData"]["CurrentQuarter"]
+
+try:
+ SHEETS = [
+ i for i in dict((runtime_config.items("TemplateSheets"))).values()
+ ]
+ BLANK_TEMPLATE_FN = runtime_config["BlankTemplate"]["name"]
+except configparser.NoSectionError:
+ print(
+ "There is no config file present. Please run bcompiler-init to initialise bcompiler"
+ )
+ sys.exit()
+
+
+def directory_has_returns_check(dir: str):
+ if os.listdir(dir) == []:
+ logger.critical(
+ "Please copy populated return files to returns directory.")
+ return False
+ else:
+ return True
+
+
+def row_check(excel_file: str):
+ wb = load_workbook(excel_file)
+ data = []
+ for sheet in wb.sheetnames:
+ ws = wb[sheet]
+ rows = ws.rows
+ data.append(
+ dict(
+ workbook=excel_file.split("/")[-1],
+ sheet=sheet,
+ row_count=len(list(rows)),
+ ))
+ return data
+
+
+def row_data_formatter(csv_output=False, quiet=False) -> None:
+ """
+ Prints counts of rows in each sheet in each return spreadsheet.
+ :param: csv_output - provide True to write output to csv file in output
+ directory.
+ :param: quiet - output differing row counts only. Cannot be used with
+ csv_output argument.
+ """
+ if csv_output and quiet:
+ logger.critical("Cannot use --csv and --quiet option. Choose one"
+ " or the other.")
+ return
+ try:
+ returns_dir = os.path.join(ROOT_PATH, "source", "returns")
+ except FileNotFoundError:
+ logger.warning("There is no output directory. Run bcompiler -d to "
+ "set up working directories")
+ try:
+ tmpl_data = row_check(
+ os.path.join(ROOT_PATH, "source", BLANK_TEMPLATE_FN))
+ except FileNotFoundError:
+ logger.warning("bicc_template.xlsm not found")
+ if csv_output:
+ csv_output_path = os.path.join(OUTPUT_DIR, "row_count.csv")
+ csv_output_file = open(csv_output_path, "w", newline="")
+ csv_writer = csv.writer(csv_output_file)
+ logger.info("Writing output to csv file...")
+ elif quiet:
+ logger.info("Looking for anomolies in row counts in each sheet...")
+ else:
+ print("{0:<90}{1:<40}{2:<10}".format("Workbook", "Sheet", "Row Count"))
+ print("{:#<150}".format(""))
+
+ # Start with the bicc_template.xlsm BASE data
+ for line in tmpl_data:
+ if csv_output:
+ csv_writer.writerow(
+ [line["workbook"], line["sheet"], line["row_count"]])
+ elif quiet:
+ pass
+ else:
+ print(
+ f"{line['workbook']:<90}{line['sheet']:<40}{line['row_count']:<10}"
+ )
+ print("{:#<150}".format(""))
+ for f in os.listdir(returns_dir):
+ if fnmatch.fnmatch(f, "*.xlsm"):
+ d = row_check(os.path.join(returns_dir, f))
+ zipped_data = zip(tmpl_data, d)
+ for line in zipped_data:
+ counts = [i["row_count"] for i in line]
+ flag = counts[0] != counts[-1]
+ if not flag:
+ if csv_output:
+ csv_writer.writerow([
+ line[1]["workbook"],
+ line[1]["sheet"],
+ line[1]["row_count"],
+ ])
+ elif quiet:
+ pass
+ else:
+ print(
+ f"{line[1]['workbook']:<90}{line[1]['sheet']:<40}{line[1]['row_count']:<10}"
+ )
+ else:
+ if csv_output:
+ csv_writer.writerow([
+ line[1]["workbook"],
+ line[1]["sheet"],
+ line[1]["row_count"],
+ "INCONSISTENT WITH bicc_template.xlsm",
+ ])
+ else:
+ print(
+ f"{line[1]['workbook']:<90}{line[1]['sheet']:<40}{line[1]['row_count']:<10} *"
+ )
+ if not quiet:
+ print("{:#<150}".format(""))
+ else:
+ print(".")
+ else:
+ logger.critical(f"{f} does not have .xlsm file extension.")
+ if csv_output:
+ print(f"csv output file available at {csv_output_path}")
+ csv_output_file.close()
+
+
+def quick_typechecker(*args):
+ """
+ Very simple function to filter allowed types (int, float). Any other type
+ returns False. All arguments must be of same type.
+ """
+ for arg in args:
+ if isinstance(arg, (int, float, date)):
+ pass
+ else:
+ return False
+ return True
+
+
+def simple_round(fl, prec):
+ """Rounds a fl to prec precision."""
+ return round(fl, prec)
+
+
+def bc_is_close(x, y):
+ """Returns true if acceptably close."""
+ if isinstance(x, (date, datetime)) or isinstance(y, (date, datetime)):
+ return False
+ else:
+ return isclose(x, y, rel_tol=0.001)
+
+
+def cell_bg_colour(rgb=[]):
+ """
+ Give it a list of integers between 0 and 255 - three of them.
+ """
+ c_value = "{0:02X}{1:02X}{2:02X}".format(*rgb)
+ return PatternFill(patternType="solid", fgColor=c_value, bgColor=c_value)
+
+
+def get_relevant_names(project_name, project_data):
+
+ try:
+ sro_first_name = project_data[project_name]["SRO Full Name"].split(
+ " ")[0]
+ except IndexError:
+ logger.warning(
+ "SRO Full Name ({0}) is not suitable for splitting".format(
+ project_data[project_name]["SRO Full Name"]))
+
+ try:
+ sro_last_name = project_data[project_name]["SRO Full Name"].split(
+ " ")[1]
+ except IndexError:
+ logger.warning(
+ "SRO Full Name ({0}) is not suitable for splitting".format(
+ project_data[project_name]["SRO Full Name"]))
+
+ try:
+ pd_first_name = project_data[project_name]["PD Full Name"].split(
+ " ")[0]
+ except IndexError:
+ logger.warning(
+ "PD Full Name ({0}) is not suitable for splitting".format(
+ project_data[project_name]["PD Full Name"]))
+
+ try:
+ pd_last_name = project_data[project_name]["PD Full Name"].split(" ")[1]
+ except IndexError:
+ logger.warning(
+ "PD Full Name ({0}) is not suitable for splitting".format(
+ project_data[project_name]["PD Full Name"]))
+
+ try:
+ sro_d = dict(first_name=sro_first_name, last_name=sro_last_name)
+ except UnboundLocalError:
+ sro_d = None
+ try:
+ pd_d = dict(first_name=pd_first_name, last_name=pd_last_name)
+ except UnboundLocalError:
+ pd_d = None
+
+ return (sro_d, pd_d)
+
+
+def project_data_from_master(master_file: str, opened_wb=False):
+ if opened_wb is False:
+ wb = load_workbook(master_file)
+ ws = wb.active
+ else:
+ wb = master_file
+ ws = wb.active
+ # cleanse the keys
+ for cell in ws["A"]:
+ # we don't want to clean None...
+ if cell.value is None:
+ continue
+ c = Cleanser(cell.value)
+ cell.value = c.clean()
+ p_dict = {}
+ for col in ws.iter_cols(min_col=2):
+ project_name = ""
+ o = OrderedDict()
+ for cell in col:
+ if cell.row == 1:
+ project_name = cell.value
+ p_dict[project_name] = o
+ else:
+ val = ws.cell(row=cell.row, column=1).value
+ if type(cell.value) == datetime:
+ d_value = date(cell.value.year, cell.value.month,
+ cell.value.day)
+ p_dict[project_name][val] = d_value
+ else:
+ p_dict[project_name][val] = cell.value
+ # remove any "None" projects that were pulled from the master
+ try:
+ del p_dict[None]
+ except KeyError:
+ pass
+ return p_dict
+
+
+def project_data_line():
+ p_dict = {}
+ with open(SOURCE_DIR + "master_transposed.csv", "r") as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+ key = row.pop("Project/Programme Name")
+ if key in p_dict:
+ pass
+ p_dict[key] = row
+ logger.debug(
+ "Adding {} to project_data_line dictionary".format(key))
+ return p_dict
+
+
+def open_openpyxl_template(template_file):
+ """
+ Opens an xlsx file (the template) and returns the openpyxl object.
+ """
+ wb = load_workbook(template_file, keep_vba=True)
+ logger.info("Opening {} as an openpyxl object".format(template_file))
+ return wb
+
+
+def working_directory(dir_type=None):
+ """
+ Returns the working directory for source files
+ :return: path to the working directory intended for the source files
+ """
+ docs = os.path.join(os.path.expanduser("~"), "Documents")
+ bcomp_working_d = "bcompiler"
+ try:
+ root_path = os.path.join(docs, bcomp_working_d)
+ except FileNotFoundError:
+ print("You need to run with --create-wd to",
+ "create the working directory")
+ if dir_type == "source":
+ return root_path + "/source/"
+ elif dir_type == "output":
+ return root_path + "/output/"
+ elif dir_type == "returns":
+ return root_path + "/source/returns/"
+ else:
+ return
+
+
+# TODO this lot needs cleaning up - no more use of working_directory()
+
+SOURCE_DIR = working_directory("source")
+OUTPUT_DIR = working_directory("output")
+RETURNS_DIR = working_directory("returns")
+DATAMAP_RETURN_TO_MASTER = SOURCE_DIR + "datamap.csv"
+DATAMAP_MASTER_TO_RETURN = SOURCE_DIR + "datamap.csv"
+DATAMAP_MASTER_TO_GMPP = SOURCE_DIR + "archive/datamap-master-to-gmpp"
+CLEANED_DATAMAP = SOURCE_DIR + "cleaned_datamap.csv"
+MASTER = SOURCE_DIR + "master.csv"
+TEMPLATE = SOURCE_DIR + BLANK_TEMPLATE_FN
+GMPP_TEMPLATE = SOURCE_DIR + "archive/gmpp_template.xlsx"
+
+
+def index_returns_directory():
+ """
+ Prior to compiling a master, it is useful to get the order of projects
+ by their file name, as the compile.run() function traverses the directory
+ top to bottom to build the master. We can then use this to compare with the
+ order or projects (columns) in the old master document we are comparing
+ the current compile. This is pretty hackish but needs must...
+ """
+ target_files = []
+ for f in os.listdir(RETURNS_DIR):
+ target_files.append(f)
+
+ pnames_in_returns_dir = []
+ for f in target_files:
+ if fnmatch.fnmatch(f, "*.xlsm"):
+ wb = load_workbook(os.path.join(RETURNS_DIR, f))
+ ws = wb[runtime_config["TemplateSheets"]["summary_sheet"]]
+ pnames_in_returns_dir.append(ws["B5"].value)
+ return pnames_in_returns_dir
+
+
+def splat_rows(row):
+ yield [(c.value, c.row, c.column) for c in row]
+
+
+def parse_csv_to_file(source_file):
+ """
+ Transposes the master to a new master_transposed.csv file.
+ :param source_file:
+ :return:
+ """
+ output = open(SOURCE_DIR + "master_transposed.csv", "w+")
+ try:
+ source = open(source_file, "r")
+ except FileNotFoundError:
+ logger.critical(f"There is no file {source_file} present.")
+ source.close()
+ return
+ with open(source_file, "r") as source_f:
+ lis = [x.split(",") for x in source_f]
+ for i in lis:
+ # we need to do this to remove trailing "\n" from the end of
+ # each original master.csv line
+ i[-1] = i[-1].rstrip()
+
+ for x in zip(*lis):
+ for y in x:
+ output.write(y + ",")
+ output.write("\n")
+ output.close()
+
+
+def create_master_dict_transposed(source_master_csv):
+ """
+ The side-effect of the following function is to ensure there is a
+ 'master_transposed.csv' file present in SOURCE_DIR
+ returns a list of dicts, which makes up all the data from the master
+ """
+ parse_csv_to_file(source_master_csv)
+ with open(SOURCE_DIR + "master_transposed.csv", "r") as f:
+ r = csv.DictReader(f)
+ ls = [row for row in r]
+ return ls
+
+
+sheet_name = "Dropdown"
+
+VALIDATION_REFERENCES = {
+ "Quarter":
+ "{0}!$A$2:$A$9".format(quote_sheetname(sheet_name)),
+ "Joining Qtr":
+ "{0}!$B$2:$B$25".format(quote_sheetname(sheet_name)),
+ "Classification":
+ "{0}!$C$2:$C$4".format(quote_sheetname(sheet_name)),
+ "Entity format":
+ "{0}!$D$2:$D$4".format(quote_sheetname(sheet_name)),
+ "Methodology":
+ "{0}!$E$2:$E$10".format(quote_sheetname(sheet_name)),
+ "Category":
+ "{0}!$F$2:$H$11".format(quote_sheetname(sheet_name)),
+ "Scope Changed":
+ "{0}!$G$2:$I$4".format(quote_sheetname(sheet_name)),
+ "Monetised / Non Monetised Benefits":
+ "{0}!$H$2:$H$4".format(quote_sheetname(sheet_name)),
+ "RAG":
+ "{0}!$I$2:$I$6".format(quote_sheetname(sheet_name)),
+ "RAG 2":
+ "{0}!$J$2:$J$4".format(quote_sheetname(sheet_name)),
+ "RPA level":
+ "{0}!$K$2:$K$4".format(quote_sheetname(sheet_name)),
+ "Capability RAG":
+ "{0}!$L$2:$L$5".format(quote_sheetname(sheet_name)),
+ "MPLA / PLP":
+ "{0}!$M$2:$M$30".format(quote_sheetname(sheet_name)),
+ "PL Changes":
+ "{0}!$N$2:$N$31".format(quote_sheetname(sheet_name)),
+ "Stage":
+ "{0}!$O$2:$O$10".format(quote_sheetname(sheet_name)),
+ "Business Cases":
+ "{0}!$P$2:$P$11".format(quote_sheetname(sheet_name)),
+ "Milestone Types":
+ "{0}!$Q$2:$Q$4".format(quote_sheetname(sheet_name)),
+ "Finance figures format":
+ "{0}!$R$2:$R$3".format(quote_sheetname(sheet_name)),
+ "Index Years":
+ "{0}!$S$2:$S$27".format(quote_sheetname(sheet_name)),
+ "Discount Rate":
+ "{0}!$T$2:$T$32".format(quote_sheetname(sheet_name)),
+ "Finance type":
+ "{0}!$U$2:$U$6".format(quote_sheetname(sheet_name)),
+ "Yes/No":
+ "{0}!$V$2:$V$3".format(quote_sheetname(sheet_name)),
+ "Years (Spend)":
+ "{0}!$W$2:$W$90".format(quote_sheetname(sheet_name)),
+ "Years (Benefits)":
+ "{0}!$X$2:$X$90".format(quote_sheetname(sheet_name)),
+ "Snapshot Dates":
+ "{0}!$Y$2:$Y$9".format(quote_sheetname(sheet_name)),
+ "Percentage of time spent on SRO role":
+ "{0}!$Z$2:$Z$21".format(quote_sheetname(sheet_name)),
+ "AR Category":
+ "{0}!$AA$2:$AA$5".format(quote_sheetname(sheet_name)),
+ "Project Lifecycle":
+ "{0}!$AB$2:$AB$6".format(quote_sheetname(sheet_name)),
+ "Programme Lifecycle":
+ "{0}!$AC$2:$AC$7".format(quote_sheetname(sheet_name)),
+ "Other":
+ "{0}!$AD$2:$AD$19".format(quote_sheetname(sheet_name)),
+ "Start / Year end - FY":
+ "{0}!$AE$3:$AE$22".format(quote_sheetname(sheet_name)),
+ "Count":
+ "{0}!$AF$2:$AF$22".format(quote_sheetname(sheet_name)),
+ "VFM":
+ "{0}!$AG$2:$AG$11".format(quote_sheetname(sheet_name)),
+ "DfT Group":
+ "{0}!$AH$2:$AH$7".format(quote_sheetname(sheet_name)),
+ "DfT Division":
+ "{0}!$AI$2:$AI$15".format(quote_sheetname(sheet_name)),
+ "Agency":
+ "{0}!$AJ$2:$AJ$9".format(quote_sheetname(sheet_name)),
+ "High Speed Rail":
+ "{0}!$AK$2:$AK$4".format(quote_sheetname(sheet_name)),
+ "Rail Group":
+ "{0}!$AL$2:$AL$4".format(quote_sheetname(sheet_name)),
+ "Roads, Devolution & Motoring":
+ "{0}!$AM$2:$AM$5".format(quote_sheetname(sheet_name)),
+ "International, Security and Environment":
+ "{0}!$AN$2:$AN$4".format(quote_sheetname(sheet_name)),
+ "Resource and Strategy":
+ "{0}!$AO$2:$AO$2".format(quote_sheetname(sheet_name)),
+ "Non-Group":
+ "{0}!$AP$2:$AP$2".format(quote_sheetname(sheet_name)),
+ "GMPP Annual Report Category":
+ "{0}!$AQ$2:$AQ$2".format(quote_sheetname(sheet_name)),
+ "SDP":
+ "{0}!$AR2:$AR$5".format(quote_sheetname(sheet_name)),
+}
+
+
+def row_accessor(row: tuple):
+ """
+ Utility generator yielding tuple of form (str, str); e.g
+ ('A10', 'Project/Programme Name').
+ :param row:
+ :return:
+ """
+ for item in row:
+ yield ("".join([item.column, str(item.row)]), item.value)
+
+
+def gen_sheet_data(workbook: str) -> dict:
+ """
+ Returns a dict containing data from a given xlsx file, by sheet
+ within that workbook.
+ :param path to xlsx file:
+ :return: dict of data by sheet in workbook
+ """
+ wb = load_workbook(workbook)
+ sheets = wb._sheets
+ data = {}
+ for s in sheets:
+ rows = s.rows
+ title = s.title
+ data[title] = [list(row_accessor(x)) for x in rows]
+ return data
+
+
+def parse_data_row(row: list) -> tuple:
+ """
+ Utility generator which processes two-item tuples in a list.
+ :param row:
+ :return: tuple of form (str, str); e.g. ('A10', 'Project/Programme Name')
+ """
+ for item in row:
+ yield item[0], item[1]
+
+
+def get_sheets_in_workbook(real_template: str) -> list:
+ """
+ Utility function to return a list of sheet names from an xlsx file.
+ :param real_template:
+ :return: list of sheet names
+ """
+ wb = load_workbook(real_template)
+ sheets = wb._sheets
+ return sheets
+
+
+def generate_test_template_from_real(real_template: str,
+ save_path: str) -> None:
+ """
+ Given the bicc_template.xlsm file, this function strips it of
+ everything but cell data.
+ :param real_template: str path of location of bicc_template.xlsm
+ :param save_path: str path of output directory; file will be named 'gen_bicc_template.xlsm',
+ of the form "~/Documents"
+ :return:
+ """
+ data = gen_sheet_data(real_template)
+ sheets = get_sheets_in_workbook(real_template)
+ blank = Workbook()
+ sheet_order = 0
+ for sheet in sheets:
+ summary_sheet = blank.create_sheet(sheet.title, sheet_order)
+ for row in data[sheet.title]:
+ r = parse_data_row(row)
+ for cell in r:
+ summary_sheet[cell[0]] = cell[1]
+ sheet_order += 1
+ if save_path.endswith("/"):
+ save_path = save_path[:-1]
+ blank.save("".join([save_path, "/gen_bicc_template.xlsm"]))
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e4e8b5e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+import os
+
+from setuptools import find_packages, setup
+
+import datamaps
+
+
+def read(*names):
+ values = dict()
+ extensions = ['.txt', '.rst']
+ for name in names:
+ value = ''
+ for extension in extensions:
+ filename = name + extension
+ if os.path.isfile(filename):
+ value = open(name + extension).read()
+ break
+ values[name] = value
+ return values
+
+
+long_description = """
+%(README)s
+
+News
+====
+
+%(CHANGES)s
+
+""" % read('README', 'CHANGES')
+
+setup(
+ name='datamaps',
+ version=datamaps.__version__,
+ description='Collect and clean data using Excel spreadsheets.',
+ long_description=long_description,
+ classifiers=[
+ "Development Status :: 3 - Alpha",
+ "Environment :: Console",
+ "Intended Audience :: Other Audience",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Topic :: Text Processing",
+ ],
+ keywords='data console commandline excel',
+ author='Matthew Lemon',
+ author_email='matt@matthewlemon.com',
+ maintainer='Matthew Lemon',
+ maintainer_email='matt@matthewlemon.com',
+ url='https://github.com/hammerheadlemon/datamaps',
+ packages=find_packages(),
+ python_requires='>=3.6',
+ entry_points={'console_scripts': [
+ 'datamaps = datamaps.main:cli'
+ ]},
+ setup_requires=['wheel'],
+ install_requires=[
+ 'click',
+ 'colorlog',
+ 'python-dateutil',
+ 'bcompiler-engine @ https://github.com/hammerheadlemon/bcompiler-engine/archive/master.zip#egg=bcompiler-engine'
+ ],
+ test_suite='datamaps.tests')