diff options
author | Matthew Lemon <lemon@matthewlemon.com> | 2019-09-27 15:12:57 +0100 |
---|---|---|
committer | Matthew Lemon <lemon@matthewlemon.com> | 2019-09-27 15:12:57 +0100 |
commit | d4272b28da76c2d1fe110d611ef08a445f11ceb2 (patch) | |
tree | af01f431bd39504bb2d9234c7c6a41879b95051f |
initial commit
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Pipfile | 18 | ||||
-rw-r--r-- | Pipfile.lock | 159 | ||||
-rw-r--r-- | datamaps/__init__.py | 1 | ||||
-rw-r--r-- | datamaps/api/__init__.py | 5 | ||||
-rw-r--r-- | datamaps/api/api.py | 14 | ||||
-rw-r--r-- | datamaps/core/__init__.py | 3 | ||||
-rw-r--r-- | datamaps/core/master.py | 208 | ||||
-rw-r--r-- | datamaps/core/row.py | 84 | ||||
-rw-r--r-- | datamaps/core/temporal.py | 121 | ||||
-rw-r--r-- | datamaps/main.py | 131 | ||||
-rw-r--r-- | datamaps/process/__init__.py | 1 | ||||
-rw-r--r-- | datamaps/process/cell.py | 42 | ||||
-rw-r--r-- | datamaps/process/cleansers.py | 287 | ||||
-rw-r--r-- | datamaps/tests/__init__.py | 0 | ||||
-rw-r--r-- | datamaps/tests/conftest.py | 9 | ||||
-rw-r--r-- | datamaps/tests/resources/.~lock.master.xlsx# | 1 | ||||
-rw-r--r-- | datamaps/tests/resources/master.xlsx | bin | 0 -> 5889 bytes | |||
-rw-r--r-- | datamaps/tests/test_api.py | 10 | ||||
-rw-r--r-- | datamaps/tests/test_cleanser.py | 41 | ||||
-rw-r--r-- | datamaps/tests/test_quarter.py | 52 | ||||
-rw-r--r-- | datamaps/utils.py | 564 | ||||
-rw-r--r-- | setup.py | 66 |
23 files changed, 1818 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f11b75 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/ @@ -0,0 +1,18 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] +pytest = "*" + +[packages] +openpyxl = "*" +bcompiler-engine = {editable = true,path = "/home/lemon/code/python/bcompiler-engine"} +colorlog = "*" +python-dateutil = "*" +click = "*" +datamaps = {editable = true,path = "."} + +[requires] +python_version = "3.8" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..101c7c4 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,159 @@ +{ + "_meta": { + "hash": { + "sha256": "98952e013f654efa60ba726c96572096e1bdaa9f45d92d0bb65d9faf905bffa2" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.8" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "appdirs": { + "hashes": [ + "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92", + "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e" + ], + "version": "==1.4.3" + }, + "bcompiler-engine": { + "editable": true, + "path": "/home/lemon/code/python/bcompiler-engine" + }, + "click": { + "hashes": [ + "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", + "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" + ], + "index": "pypi", + "version": "==7.0" + }, + "colorlog": { + "hashes": [ + "sha256:3cf31b25cbc8f86ec01fef582ef3b840950dea414084ed19ab922c8b493f9b42", + "sha256:450f52ea2a2b6ebb308f034ea9a9b15cea51e65650593dca1da3eb792e4e4981" + ], + "index": "pypi", + "version": "==4.0.2" + }, + "datamaps": { + "editable": true, + "path": "." + }, + "et-xmlfile": { + "hashes": [ + "sha256:614d9722d572f6246302c4491846d2c393c199cfa4edc9af593437691683335b" + ], + "version": "==1.0.1" + }, + "jdcal": { + "hashes": [ + "sha256:1abf1305fce18b4e8aa248cf8fe0c56ce2032392bc64bbd61b5dff2a19ec8bba", + "sha256:472872e096eb8df219c23f2689fc336668bdb43d194094b5cc1707e1640acfc8" + ], + "version": "==1.4.1" + }, + "openpyxl": { + "hashes": [ + "sha256:340a1ab2069764559b9d58027a43a24db18db0e25deb80f81ecb8ca7ee5253db" + ], + "index": "pypi", + "version": "==3.0.0" + }, + "python-dateutil": { + "hashes": [ + "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", + "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" + ], + "index": "pypi", + "version": "==2.8.0" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + } + }, + "develop": { + "atomicwrites": { + "hashes": [ + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" + ], + "version": "==1.3.0" + }, + "attrs": { + "hashes": [ + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + ], + "version": "==19.1.0" + }, + "more-itertools": { + "hashes": [ + "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", + "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" + ], + "version": "==7.2.0" + }, + "packaging": { + "hashes": [ + "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47", + "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108" + ], + "version": "==19.2" + }, + "pluggy": { + "hashes": [ + "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6", + "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34" + ], + "version": "==0.13.0" + }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pyparsing": { + "hashes": [ + "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80", + "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4" + ], + "version": "==2.4.2" + }, + "pytest": { + "hashes": [ + "sha256:813b99704b22c7d377bbd756ebe56c35252bb710937b46f207100e843440b3c2", + "sha256:cc6620b96bc667a0c8d4fa592a8c9c94178a1bd6cc799dbb057dfd9286d31a31" + ], + "index": "pypi", + "version": "==5.1.3" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + } + } +} diff --git a/datamaps/__init__.py b/datamaps/__init__.py new file mode 100644 index 0000000..c9bd737 --- /dev/null +++ b/datamaps/__init__.py @@ -0,0 +1 @@ +__version__ = "1.0.0b0" diff --git a/datamaps/api/__init__.py b/datamaps/api/__init__.py new file mode 100644 index 0000000..bb48fdb --- /dev/null +++ b/datamaps/api/__init__.py @@ -0,0 +1,5 @@ +from .api import project_data_from_master_api as project_data_from_master +from ..core.master import Master as Master +from ..core.row import Row as Row +from ..core.temporal import FinancialYear as FinancialYear +from ..core.temporal import Quarter as Quarter diff --git a/datamaps/api/api.py b/datamaps/api/api.py new file mode 100644 index 0000000..59458e7 --- /dev/null +++ b/datamaps/api/api.py @@ -0,0 +1,14 @@ +from ..core import Master, Quarter + + +def project_data_from_master_api(master_file: str, quarter: int, year: int): + """Create a Master object directly without the need to explicitly pass + a Quarter object. + + Args: + master_file (str): the path to a master file + quarter (int): an integer representing the financial quarter + year (int): an integer representing the year + """ + m = Master(Quarter(quarter, year), master_file) + return m diff --git a/datamaps/core/__init__.py b/datamaps/core/__init__.py new file mode 100644 index 0000000..305208d --- /dev/null +++ b/datamaps/core/__init__.py @@ -0,0 +1,3 @@ +from .row import Row +from .temporal import Quarter, FinancialYear +from .master import Master, ProjectData diff --git a/datamaps/core/master.py b/datamaps/core/master.py new file mode 100644 index 0000000..1fbfe90 --- /dev/null +++ b/datamaps/core/master.py @@ -0,0 +1,208 @@ +import re +import datetime +import logging +import unicodedata +from pathlib import Path +from typing import List, Tuple, Iterable, Optional, Any + +from ..utils import project_data_from_master +from ..process.cleansers import DATE_REGEX_4 +from .temporal import Quarter + +from openpyxl import load_workbook + +logger = logging.getLogger('bcompiler.utils') + + +class ProjectData: + """ + ProjectData class + """ + def __init__(self, d: dict) -> None: + """ + :py:func:`OrderedDict` is easiest to get from project_data_from_master[x] + """ + self._data = d + + def __len__(self) -> int: + return len(self._data) + + def __getitem__(self, item): + return self._data[item] + + def key_filter(self, key: str) -> List[Tuple]: + """ + Return a list of (k, v) tuples if k in master key. + """ + data = [item for item in self._data.items() if key in item[0]] + if not data: + raise KeyError("Sorry, there is no matching data") + return (data) + + def pull_keys(self, input_iter: Iterable, flat=False) -> List[Tuple[Any, ...]]: + """ + Returns a list of (key, value) tuples from ProjectData if key matches a + key. The order of tuples is based on the order of keys passed in the iterable. + """ + if flat is True: + # search and replace troublesome EN DASH character + xs = [item for item in self._data.items() + for i in input_iter if item[0].strip().replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')) == i] + xs = [_convert_str_date_to_object(x) for x in xs] + ts = sorted(xs, key=lambda x: input_iter.index(x[0].strip().replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')))) + ts = [item[1] for item in ts] + return ts + else: + xs = [item for item in self._data.items() + for i in input_iter if item[0].replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')) == i] + xs = [item for item in self._data.items() + for i in input_iter if item[0] == i] + xs = [_convert_str_date_to_object(x) for x in xs] + ts = sorted(xs, key=lambda x: input_iter.index(x[0].replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')))) + return ts + + def __repr__(self): + return f"ProjectData() - with data: {id(self._data)}" + + +def _convert_str_date_to_object(d_str: tuple) -> Tuple[str, Optional[datetime.date]]: + try: + if re.match(DATE_REGEX_4, d_str[1]): + try: + ds = d_str[1].split('-') + return (d_str[0], datetime.date(int(ds[0]), int(ds[1]), int(ds[2]))) + except TypeError: + return d_str + else: + return d_str + except TypeError: + return d_str + + +class Master: + """A Master object, representing the main central data item in ``bcompiler``. + + Args: + quarter (:py:class:`bcompiler.api.Quarter`): creating using ``Quarter(1, 2017)`` for example. + path (str): path to the master xlsx file + + A master object is a composition between a :py:class:`bcompiler.api.Quarter` object and an + actual master xlsx file on disk. + + You create one, either by creating the Quarter object first, and using that as the first + parameter of the ``Master`` constructor, e.g.:: + + from bcompiler.api import Quarter + from bcompiler.api import Master + + q1 = Quarter(1, 2016) + m1 = Master(q1, '/tmp/master_1_2016.xlsx') + + or by doing both in one:: + + m1 = Master(Quarter(1, 2016), '/tmp/master_1_2016.xlsx') + + Once you have a ``Master`` object, you can access project data from it, like this:: + + project_data = m1['Project Title'] + + + The following *attributes* are available on `m1` once created as such, e.g.:: + + data = m1.data + quarter = m1.quarter + filename = m1.filename + ..etc + """ + def __init__(self, quarter: Quarter, path: str) -> None: + self._quarter = quarter + self.path = path + self._data = project_data_from_master(self.path) + self._project_titles = [item for item in self.data.keys()] + self.year = self._quarter.year + + def __getitem__(self, project_name): + return ProjectData(self._data[project_name]) + + @property + def data(self): + """Return all the data contained in the master in a large, nested dictionary. + + The resulting data structure contains a dictionary of :py:class:`colletions.OrderedDict` items whose + key is the name of a project:: + + "Project Name": OrderedDict("key": "value" + ...) + + This object can then be further interrogated, for example to obtain all key/values + from a partictular project, by doing:: + + d = Master.data + project_data = d['PROJECT_NAME'] + + """ + return self._data + + @property + def quarter(self): + """Returns the ``Quarter`` object associated with the ``Master``. + + Example:: + + q1 = m.quarter + + ``q1`` can then be further interrogated as documented in :py:class:`core.temporal.Quarter`. + + """ + + return self._quarter + + @property + def filename(self): + """The filename of the master xlsx file, e.g. ``master_1_2017.xlsx``. + """ + p = Path(self.path) + return p.name + + @property + def projects(self): + """A list of project titles derived from the master xlsx. + """ + return self._project_titles + + def duplicate_keys(self, to_log=None): + """Checks for duplicate keys in a master xlsx file. + + Args: + to_log (bool): Optional True or False, depending on whether you want to see duplicates reported in a ``WARNING`` log message. This is used mainly for internal purposes within ``bcompiler``. + + Returns: + duplicates (set): a set of duplicated keys + """ + wb = load_workbook(self.path) + ws = wb.active + col_a = next(ws.iter_cols()) + col_a = [item.value for item in col_a] + seen: set = set() + uniq = [] + dups: set = set() + for x in col_a: + if x not in seen: + uniq.append(x) + seen.add(x) + else: + dups.add(x) + if to_log and len(dups) > 0: + for x in dups: + logger.warning(f"{self.path} contains duplicate key: \"{x}\". Masters cannot contain duplicate keys. Rename them.") + return True + elif to_log and len(dups) == 0: + logger.info(f"No duplicate keys in {self.path}") + return False + elif len(dups) > 0: + return dups + else: + return False + + def __repr__(self): + return f"Master({self.path}, {self.quarter.quarter}, {self.quarter.year})" diff --git a/datamaps/core/row.py b/datamaps/core/row.py new file mode 100644 index 0000000..470b753 --- /dev/null +++ b/datamaps/core/row.py @@ -0,0 +1,84 @@ +import string +import datetime + +from ..process.cell import Cell +from typing import Iterable + +from itertools import chain + + +class Row: + """ + A Row object is populated with an iterable (list or other sequence), bound + to an openpyxl worksheet. It is used to populate a row of cells in an output + Excel file with the values from the iterable. + + The ``anchor_column`` and ``anchor_row`` parameters represent the coordinates of + a cell which form the *leftmost* cell of the row, i.e. to set the row of data + to start at the very top left of a sheet, you'd create the ``Row()`` object this:: + + r = Row(1, 1, interable) + r.bind(ws) + """ + + def __init__(self, anchor_column: int, anchor_row: int, seq: Iterable): + if isinstance(anchor_column, str): + if len(anchor_column) == 1: + enumerated_alphabet = list(enumerate(string.ascii_uppercase, start=1)) + col_letter = [x for x in enumerated_alphabet if x[1] == anchor_column][0] + self._anchor_column = col_letter[0] + self._anchor_row = anchor_row + self._cell_map = [] + elif len(anchor_column) == 2: + enumerated_alphabet = list(enumerate(list(chain( + string.ascii_uppercase, ["{}{}".format(x[0], x[1]) for x in list(zip(['A'] * 26, string.ascii_uppercase))])), start=1)) + col_letter = [x for x in enumerated_alphabet if x[1] == anchor_column][0] + self._anchor_column = col_letter[0] + self._anchor_row = anchor_row + self._cell_map = [] + else: + raise ValueError("You can only have a column up to AZ") + else: + self._anchor_column = anchor_column + self._anchor_row = anchor_row + self._cell_map = [] + self._seq = seq + + + def _basic_bind(self, ws): + for x in list(enumerate(self._seq, start=self._anchor_column)): + self._ws.cell(row=self._anchor_row, column=x[0], value=x[1]) + + + def _cell_bind(self, ws): + self._cell_map = [] + for x in list(enumerate(self._seq, start=self._anchor_column)): + self._cell_map.append( + Cell( + cell_key="", + cell_value=x[1], + cell_reference=f"{self._anchor_column}{self._anchor_row}", + template_sheet=ws, + bg_colour=None, + fg_colour=None, + number_format=None, + verification_list=None, + r_idx=self._anchor_row, + c_idx=x[0] + ) + ) + for c in self._cell_map: + if not isinstance(c.cell_value, datetime.date) and not None: + self._ws.cell(row=c.r_idx, column=c.c_idx, value=c.cell_value).number_format = '0' + else: + self._ws.cell(row=c.r_idx, column=c.c_idx, value=c.cell_value) + + + + def bind(self, worksheet): + """Bind the Row to a particular worksheetl, which effectively does the + printing of data into cells. Must be done prior to saving the workbook. + """ + self._ws = worksheet +# self._basic_bind(self._ws) + self._cell_bind(self._ws) diff --git a/datamaps/core/temporal.py b/datamaps/core/temporal.py new file mode 100644 index 0000000..ddb7004 --- /dev/null +++ b/datamaps/core/temporal.py @@ -0,0 +1,121 @@ +import datetime + + +class FinancialYear: + """An object representing a financial year. + + Used by ``bcompiler`` internally when creating :py:class:`bcompiler.api.Master` objects. + Can be used to calculate start and ends dates and :py:class:`bcompiler.api.Quarter` objects. + + If parameter ``year`` must be in the range 150 - 2100. + + """ + + def __init__(self, year): + if isinstance(year, int) and (year in range(150, 2100)): + self.year = year + else: + raise ValueError("A year must be an integer between 1950 and 2100") + self._generate_quarters() + self._q1 = self.quarters[0] + self._q2 = self.quarters[1] + self._q3 = self.quarters[2] + self._q4 = self.quarters[3] + + self.start_date = self.q1.start_date + self.end_date = self.q4.end_date + + @property + def q1(self) -> datetime.date: + """Quarter 1 as a :py:class:`datetime.date` object + """ + return self._q1 + + @property + def q2(self): + """Quarter 2 as a :py:class:`datetime.date` object + """ + return self._q2 + + @property + def q3(self): + """Quarter 3 as a :py:class:`datetime.date` object + """ + return self._q3 + + @property + def q4(self): + """Quarter 4 as a :py:class:`datetime.date` object + """ + return self._q4 + + def __str__(self): + return f"FY{str(self.year)}/{str(self.year + 1)[2:]}" + + def _generate_quarters(self): + self.quarters = [Quarter(x, self.year) for x in range(1, 5)] + + + def __repr__(self): + return f"FinancialYear({self.year})" + + +class Quarter: + """An object representing a financial quarter. This is mainly required for building + a :py:class:`core.master.Master` object. + + Args: + quarter (int): e.g.1, 2, 3 or 4 + year (int): e.g. 2013 + """ + _start_months = { + 1: (4, 'April'), + 2: (7, 'July'), + 3: (10, 'October'), + 4: (1, 'January') + } + + _end_months = { + 1: (6, 'June', 30), + 2: (9, 'September', 30), + 3: (12, 'December', 31), + 4: (3, 'March', 31), + } + + + def __init__(self, quarter: int, year: int) -> None: + + if isinstance(quarter, int) and (quarter >= 1 and quarter <= 4): + self.quarter = quarter + else: + raise ValueError("A quarter must be either 1, 2, 3 or 4") + + if isinstance(year, int) and (year in range(1950, 2100)): + self.year = year + else: + raise ValueError("Year must be between 1950 and 2100 - surely that will do?") + + self.start_date = self._start_date(self.quarter, self.year) + self.end_date = self._end_date(self.quarter, self.year) + + def __str__(self): + return f"Q{self.quarter} {str(self.year)[2:]}/{str(self.year + 1)[2:]}" + + def _start_date(self, q, y): + if q == 4: + y = y + 1 + return datetime.date(y, Quarter._start_months[q][0], 1) + + def _end_date(self, q, y): + if q == 4: + y = y + 1 + return datetime.date(y, Quarter._end_months[q][0], Quarter._end_months[q][2]) + + def __repr__(self): + return f"Quarter({self.quarter}, {self.year})" + + @property + def fy(self): + """Return a :py:class:`core.temporal.FinancialYear` object. + """ + return FinancialYear(self.year) diff --git a/datamaps/main.py b/datamaps/main.py new file mode 100644 index 0000000..b2bd202 --- /dev/null +++ b/datamaps/main.py @@ -0,0 +1,131 @@ +""" +Copyright (c) 2019 Matthew Lemon + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. """ +import logging +from functools import partial + +import click + +import colorlog +from engine.adapters import cli as engine_cli +from engine.config import Config as engine_config +from engine.use_cases.parsing import MalFormedCSVHeaderException + +logger = colorlog.getLogger("bcompiler") +logger.setLevel(logging.INFO) + +# we want to pass echo func down to bcompiler-engine +output_funcs = dict( + click_echo_green=partial(click.secho, nl=False, fg="green"), + click_echo_yellow=partial(click.secho, nl=False, fg="yellow"), + click_echo_red=partial(click.secho, nl=False, fg="red"), + click_echo_white=partial(click.secho, nl=False, fg="white"), +) + + +class Config: + def __init__(self): + self.verbose = False + + +pass_config = click.make_pass_decorator(Config, ensure=True) + + +@click.group() +@click.option("--verbose", is_flag=True) +@pass_config +def cli(config, verbose): + """ + bcompiler is a tool for moving data to and from spreadsheets. See web site, etc. + """ + config.verbose = verbose + + +@cli.group("import") +def _import(): + """ + Import something (a batch of populated templates, a datamap, etc). + """ + + +@cli.group("export") +def export(): + """ + Export something (master data to blank templates, etc). + """ + + +@cli.group("report") +def report(): + """Create a report""" + + +@_import.command() +@click.option( + "--to-master", + "-m", + is_flag=True, + default=False, + help="Create master.xlsx immediately", +) +def templates(to_master): + engine_config.initialise() + click.secho("Hello from bcompiler 2.0!", fg="yellow") + if to_master: + try: + engine_cli.import_and_create_master(echo_funcs=output_funcs) + except MalFormedCSVHeaderException as e: + click.echo( + click.style("Incorrect headers in datamap. {}.".format(e.args[0]), bold=True, reverse=True, fg="cyan")) + else: + click.secho("Not implemented yet. Try --to-master/-m flag") + + +@export.command() +# @click.argument("datamap") +# @click.argument("blank") +@click.argument("master") +def master(master): + engine_config.initialise() + + input_dir = engine_config.PLATFORM_DOCS_DIR / "input" + + blank_fn = engine_config.config_parser["DEFAULT"]["blank file name"] + datamap_fn = engine_config.config_parser["DEFAULT"]["datamap file name"] + + blank = input_dir / blank_fn + datamap = input_dir / datamap_fn + + click.secho(f"Exporting master {master} to templates based on {blank}...") + + try: + engine_cli.write_master_to_templates(blank, datamap, master) + except FileNotFoundError as e: + click.secho(str(e), fg="red") + + +@report.command() +@click.argument("target_file") +def data_validations(target_file): + """Requires the path to the target spreadsheet file.""" + click.secho(f"Getting data validations from: {target_file}", fg="green") + engine_config.initialise() + report = engine_cli.report_data_validations_in_file(target_file) + for r in report: + click.secho(r) diff --git a/datamaps/process/__init__.py b/datamaps/process/__init__.py new file mode 100644 index 0000000..12a88fc --- /dev/null +++ b/datamaps/process/__init__.py @@ -0,0 +1 @@ +from .cleansers import Cleanser diff --git a/datamaps/process/cell.py b/datamaps/process/cell.py new file mode 100644 index 0000000..edd964b --- /dev/null +++ b/datamaps/process/cell.py @@ -0,0 +1,42 @@ +from typing import Any, Union + + +class Cell: + """ + Purpose of the Cell object is to hold data about a spreadsheet cell. + They are used to populate a datamap cell_map and to write out data to + a template. + """ + def __init__(self, + cell_key: str, + cell_value: Any, + cell_reference: str, + template_sheet: str, + bg_colour: Union[str, None], + fg_colour: Union[str, None], + number_format: Union[str, None], + verification_list: Union[str, None], + r_idx: int=None, + c_idx: int=None + ) -> None: + if cell_value: + self.cell_value = cell_value + else: + self.cell_value = None + self.cell_key = cell_key + self.cell_reference = cell_reference + self.template_sheet = template_sheet + self.bg_colour = bg_colour + self.fg_colour = fg_colour + self.number_format = number_format + self.verification_list = verification_list + self.r_idx = r_idx + self.c_idx = c_idx + + def __repr__(self) -> str: + return ("<Cell: cell_key: {} cell_value: {} cell_reference: {} " + "template_sheet: {}>".format( + self.cell_key, + self.cell_value, + self.cell_reference, + self.template_sheet)) diff --git a/datamaps/process/cleansers.py b/datamaps/process/cleansers.py new file mode 100644 index 0000000..d562f9c --- /dev/null +++ b/datamaps/process/cleansers.py @@ -0,0 +1,287 @@ +import datetime +import re +from datetime import date +from operator import itemgetter + +import colorlog +from dateutil.parser import parse + +logger = colorlog.getLogger("bcompiler.cleanser") + +ENDASH_REGEX = r"–" +ENDASH_FIX = r"-" +COMMA_REGEX = r",\s?" +COMMA_FIX = r" " +APOS_REGEX = r"^'" +APOS_FIX = r"" +TRAILING_SPACE_REGEX = r"(.+)( | )$" +DATE_REGEX = r"^(\d{2,4})(\/|-|\.)(\d{1,2})(\/|-|\.)(\d{2,4})" +DATE_REGEX_4 = r"^(\d{2,4})(/|-|\.)(\d{1,2})(/|-|\.)(\d{1,2})" +DATE_REGEX_TIME = r"^(\d{2,4})(/|-)(\d{1,2})(/|-)(\d{1,2})\s(0:00:00)" +INT_REGEX = r"^[-+]?\d+$" +FLOAT_REGEX = r"^[-+]?([0-9]*)\.[0-9]+$" +NL_REGEX = r"\n" +NL_FIX = r" | " +SPACE_PIPE_CHAR_REGEX = r"\ \|\S" +SPACE_PIPE_CHAR_FIX = r" | " +PERCENT_REGEX = r"^(\d{1,3})%$" +POUND_REGEX = r"^(-)?£(\d+(\.\d{1,2})?)(\d+)?$" # handles negative numbers + + +class Cleanser: + """ + Takes a string, and cleans it. + + Doctests: + >>> t = "Text, with commas" + >>> c = Cleanser(t) + >>> c.clean() + 'Text with commas' + >>> a = "\'Text with leading apos." + >>> c = Cleanser(a) + >>> c.clean() + 'Text with leading apos.' + + """ + + def __init__(self, string): + self.string = string + + # a list of dicts that describe everything needed to fix errors in + # string passed to class constructor. Method self.clean() runs through + # them, fixing each in turn. + self._checks = [ + dict( + c_type="emdash", + rule=ENDASH_REGEX, + fix=ENDASH_FIX, + func=self._endash, + count=0, + ), + dict( + c_type="commas", + rule=COMMA_REGEX, + fix=COMMA_FIX, + func=self._commas, + count=0, + ), + dict( + c_type="leading_apostrophe", + rule=APOS_REGEX, + fix=APOS_FIX, + func=self._apostrophe, + count=0, + ), + dict(c_type="newline", + rule=NL_REGEX, + fix=NL_FIX, + func=self._newline, + count=0), + dict( + c_type="double_space", + rule=" ", + fix=" ", + func=self._doublespace, + count=0, + ), + dict( + c_type="trailing_space", + rule=TRAILING_SPACE_REGEX, + fix=None, + func=self._trailingspace, + count=0, + ), + dict( + c_type="pipe_char", + rule=SPACE_PIPE_CHAR_REGEX, + fix=SPACE_PIPE_CHAR_FIX, + func=self._space_pipe_char, + count=0, + ), + dict(c_type="date", + rule=DATE_REGEX, + fix=None, + func=self._date, + count=0), + dict( + c_type="date_time", + rule=DATE_REGEX_TIME, + fix=None, + func=self._date_time, + count=0, + ), + dict(c_type="int", + rule=INT_REGEX, + fix=None, + func=self._int, + count=0), + dict(c_type="float", + rule=FLOAT_REGEX, + fix=None, + func=self._float, + count=0), + dict( + c_type="percent", + rule=PERCENT_REGEX, + fix=None, + func=self._percent, + count=0, + ), + dict(c_type="pound", + rule=POUND_REGEX, + fix=None, + func=self._pound, + count=0), + ] + self.checks_l = len(self._checks) + self._analyse() + + def _sort_checks(self): + """ + Sorts the list of dicts in self._checks by their count, highest + first, so that when the fix methods run down them, they always have + a count with a value higher than 0 to run with, otherwise later + fixes might not get hit. + """ + self._checks = sorted(self._checks, + key=itemgetter("count"), + reverse=True) + + def _endash(self, regex, fix): + """ + Turns – into -. + """ + return re.sub(regex, fix, self.string) + + def _pound(self, regex, fix): + """ + Turns £12.24 into 12.24 (a float). + """ + m = re.match(regex, self.string) + sum_p = m.group(2) + if m.group(1) == "-": + return float(sum_p) * -1 + else: + return float(sum_p) + + def _percent(self, regex, fix): + """ + Turns 100% into 1.0. + """ + m = re.match(regex, self.string) + p = int(m.group(1)) + return p / 100 + + def _float(self, regex, fix): + """ + Turns numbers that look like floats into floats. + """ + return float(self.string) + + def _int(self, regex, fix): + """ + Turns numbers that look like integers into integers. + """ + return int(self.string) + + def _date(self, regex, fix): + """ + Handles dates in "03/05/2016" format. + """ + m = re.match(regex, self.string) + if int(m.groups()[-1]) in range(1965, 1967): + logger.warning( + ("Dates inputted as dd/mm/65 will migrate as dd/mm/2065. " + "Dates inputted as dd/mm/66 will migrate as dd/mm/1966.")) + try: + if len(m.string.split("-")[0]) == 4: # year is first + return datetime.date( + int(m.string.split("-")[0]), + int(m.string.split("-")[1]), + int(m.string.split("-")[2]), + ) + else: + return parse(m.string, dayfirst=True).date() + except IndexError: + pass + except ValueError: + logger.warning( + 'Potential date issue (perhaps a date mixed with free text?): "{}"' + .format(self.string)) + return self.string + + def _date_time(self, regex, fix): + """ + Handles dates in "2017-05-01 0:00:00" format. We get this from the + csv file when we send it back out to templates/forms. Returns a Python + date object. + """ + m = re.match(regex, self.string) + year = int(m.group(1)) + month = int(m.group(3)) + day = int(m.group(5)) + try: + return date(year, month, day) + except ValueError: + logger.error("Incorrect date format {}!".format(self.string)) + return self.string + + def _commas(self, regex, fix): + """ + Handles commas in self.string according to rule in self._checks + """ + # we want to sort the list first so self._checks has any item + # with a count > 0 up front, otherwise if a count of 0 appears + # before it in the list, the > 0 count never gets fixed + return re.sub(regex, fix, self.string) + + def _apostrophe(self, regex, fix): + """Handles apostrophes as first char of the string.""" + return self.string.lstrip("'") + + def _newline(self, regex, fix): + """Handles newlines anywhere in string.""" + return re.sub(regex, fix, self.string) + + def _doublespace(self, regex, fix): + """Handles double-spaces anywhere in string.""" + return re.sub(regex, fix, self.string) + + def _trailingspace(self, regex, fix): + """Handles trailing space in the string.""" + return self.string.strip() + + def _space_pipe_char(self, regex, fix): + """Handles space pipe char anywhere in string.""" + return re.sub(regex, fix, self.string) + + def _access_checks(self, c_type): + """Helper method returns the index of rule in self._checks + when given a c_type""" + return self._checks.index( + next(item for item in self._checks if item["c_type"] == c_type)) + + def _analyse(self): + """ + Uses the self._checks table as a basis for counting the number of + each cleaning target required, and calling the appropriate method + to clean. + """ + i = 0 + while i < self.checks_l: + matches = re.finditer(self._checks[i]["rule"], self.string) + if matches: + self._checks[i]["count"] += len(list(matches)) + i += 1 + + def clean(self): + """Runs each applicable cleaning action and returns the cleaned + string.""" + self._sort_checks() + for check in self._checks: + if check["count"] > 0: + self.string = check["func"](check["rule"], check["fix"]) + check["count"] = 0 + else: + return self.string + return self.string diff --git a/datamaps/tests/__init__.py b/datamaps/tests/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/datamaps/tests/__init__.py diff --git a/datamaps/tests/conftest.py b/datamaps/tests/conftest.py new file mode 100644 index 0000000..22f20bf --- /dev/null +++ b/datamaps/tests/conftest.py @@ -0,0 +1,9 @@ +import os + +import pytest +from pathlib import Path + + +@pytest.fixture +def master() -> Path: + return Path.cwd() / "tests" / "resources" / "master.xlsx" diff --git a/datamaps/tests/resources/.~lock.master.xlsx# b/datamaps/tests/resources/.~lock.master.xlsx# new file mode 100644 index 0000000..7fb98fe --- /dev/null +++ b/datamaps/tests/resources/.~lock.master.xlsx# @@ -0,0 +1 @@ +,lemon,grafter.chutney.lan,27.09.2019 14:05,file:///home/lemon/.config/libreoffice/4;
\ No newline at end of file diff --git a/datamaps/tests/resources/master.xlsx b/datamaps/tests/resources/master.xlsx Binary files differnew file mode 100644 index 0000000..bf5adf0 --- /dev/null +++ b/datamaps/tests/resources/master.xlsx diff --git a/datamaps/tests/test_api.py b/datamaps/tests/test_api.py new file mode 100644 index 0000000..273f984 --- /dev/null +++ b/datamaps/tests/test_api.py @@ -0,0 +1,10 @@ +import datetime + +from ..api import project_data_from_master + + +def test_get_project_data(master): + master = project_data_from_master(master, 1, 2019) + assert master["Chutney Bridge.xlsm"]["Project/Programme Name"] == "Chutney Bridge Ltd" + assert master.quarter.quarter == 1 + assert master.quarter.end_date == datetime.date(2019, 6, 30) diff --git a/datamaps/tests/test_cleanser.py b/datamaps/tests/test_cleanser.py new file mode 100644 index 0000000..9d6df1a --- /dev/null +++ b/datamaps/tests/test_cleanser.py @@ -0,0 +1,41 @@ +import datetime +from ..process.cleansers import Cleanser + + +def test_cleaning_dot_date(): + ds = "25.1.72" + ds_double = "25.01.72" + four_year = "25.01.1972" + c = Cleanser(ds) + c_double = Cleanser(ds_double) + c_four = Cleanser(four_year) + assert c.clean() == datetime.date(1972, 1, 25) + assert c_double.clean() == datetime.date(1972, 1, 25) + assert c_four.clean() == datetime.date(1972, 1, 25) + + +def test_cleaning_slash_date(): + ds = "25/1/72" + ds_double = "25/01/72" + four_year = "25/01/1972" + c = Cleanser(ds) + c_double = Cleanser(ds_double) + c_four = Cleanser(four_year) + assert c.clean() == datetime.date(1972, 1, 25) + assert c_double.clean() == datetime.date(1972, 1, 25) + assert c_four.clean() == datetime.date(1972, 1, 25) + + +def test_em_dash_key(): + contains_em_dash = 'Pre 14-15 BL – Income both Revenue and Capital' + c = Cleanser(contains_em_dash) + assert c.clean() == 'Pre 14-15 BL - Income both Revenue and Capital' + + +def test_double_trailing_space(): + contains_double_trailing = 'Pre 14-15 BL - Incoming both Revenue and Capital ' + contains_single_trailing = 'Pre 14-15 BL - Incoming both Revenue and Capital ' + c = Cleanser(contains_double_trailing) + assert c.clean() == 'Pre 14-15 BL - Incoming both Revenue and Capital' + c = Cleanser(contains_single_trailing) + assert c.clean() == 'Pre 14-15 BL - Incoming both Revenue and Capital' diff --git a/datamaps/tests/test_quarter.py b/datamaps/tests/test_quarter.py new file mode 100644 index 0000000..c5b1824 --- /dev/null +++ b/datamaps/tests/test_quarter.py @@ -0,0 +1,52 @@ +import datetime + +import pytest + +from ..core import Quarter + + +def test_initialisation(): + q = Quarter(1, 2017) + assert q.start_date == datetime.date(2017, 4, 1) + assert q.end_date == datetime.date(2017, 6, 30) + q = Quarter(2, 2017) + assert q.start_date == datetime.date(2017, 7, 1) + assert q.end_date == datetime.date(2017, 9, 30) + q = Quarter(4, 2017) + assert q.start_date == datetime.date(2018, 1, 1) + assert q.end_date == datetime.date(2018, 3, 31) + + +def test_desc_string(): + assert str(Quarter(1, 2013)) == "Q1 13/14" + assert str(Quarter(2, 2013)) == "Q2 13/14" + assert str(Quarter(3, 2013)) == "Q3 13/14" + assert str(Quarter(4, 2013)) == "Q4 13/14" + + assert str(Quarter(1, 1998)) == "Q1 98/99" + assert str(Quarter(2, 1998)) == "Q2 98/99" + assert str(Quarter(3, 1998)) == "Q3 98/99" + assert str(Quarter(4, 1998)) == "Q4 98/99" + + assert str(Quarter(1, 1999)) == "Q1 99/00" + assert str(Quarter(2, 1999)) == "Q2 99/00" + assert str(Quarter(3, 1999)) == "Q3 99/00" + assert str(Quarter(4, 1999)) == "Q4 99/00" + + +def test_errors(): + with pytest.raises(ValueError) as excinfo: + Quarter(5, 2017) + assert "A quarter must be either 1, 2, 3 or 4" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + Quarter(3, 1921) + assert "Year must be between 1950 and 2100 - surely that will do?" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + Quarter("3", 2016) + assert "A quarter must be either 1, 2, 3 or 4" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + Quarter(3, "1921") + assert "Year must be between 1950 and 2100 - surely that will do?" in str(excinfo.value) diff --git a/datamaps/utils.py b/datamaps/utils.py new file mode 100644 index 0000000..950369e --- /dev/null +++ b/datamaps/utils.py @@ -0,0 +1,564 @@ +import configparser +import csv +import fnmatch +import logging +import os +import sys +from collections import OrderedDict +from datetime import date, datetime +from math import isclose + +from openpyxl import Workbook, load_workbook +from openpyxl.styles import PatternFill +from openpyxl.utils import quote_sheetname + +from .process.cleansers import Cleanser + +logger = logging.getLogger("bcompiler.utils") + +rdel_cdel_merge = "" + +DOCS = os.path.join(os.path.expanduser("~"), "Documents") +BCOMPILER_WORKING_D = "bcompiler" +ROOT_PATH = os.path.join(DOCS, BCOMPILER_WORKING_D) +SOURCE_DIR = os.path.join(ROOT_PATH, "source") + +CONFIG_FILE = os.path.join(SOURCE_DIR, "config.ini") + +runtime_config = configparser.ConfigParser() +runtime_config.read(CONFIG_FILE) + +CURRENT_QUARTER = runtime_config["QuarterData"]["CurrentQuarter"] + +try: + SHEETS = [ + i for i in dict((runtime_config.items("TemplateSheets"))).values() + ] + BLANK_TEMPLATE_FN = runtime_config["BlankTemplate"]["name"] +except configparser.NoSectionError: + print( + "There is no config file present. Please run bcompiler-init to initialise bcompiler" + ) + sys.exit() + + +def directory_has_returns_check(dir: str): + if os.listdir(dir) == []: + logger.critical( + "Please copy populated return files to returns directory.") + return False + else: + return True + + +def row_check(excel_file: str): + wb = load_workbook(excel_file) + data = [] + for sheet in wb.sheetnames: + ws = wb[sheet] + rows = ws.rows + data.append( + dict( + workbook=excel_file.split("/")[-1], + sheet=sheet, + row_count=len(list(rows)), + )) + return data + + +def row_data_formatter(csv_output=False, quiet=False) -> None: + """ + Prints counts of rows in each sheet in each return spreadsheet. + :param: csv_output - provide True to write output to csv file in output + directory. + :param: quiet - output differing row counts only. Cannot be used with + csv_output argument. + """ + if csv_output and quiet: + logger.critical("Cannot use --csv and --quiet option. Choose one" + " or the other.") + return + try: + returns_dir = os.path.join(ROOT_PATH, "source", "returns") + except FileNotFoundError: + logger.warning("There is no output directory. Run bcompiler -d to " + "set up working directories") + try: + tmpl_data = row_check( + os.path.join(ROOT_PATH, "source", BLANK_TEMPLATE_FN)) + except FileNotFoundError: + logger.warning("bicc_template.xlsm not found") + if csv_output: + csv_output_path = os.path.join(OUTPUT_DIR, "row_count.csv") + csv_output_file = open(csv_output_path, "w", newline="") + csv_writer = csv.writer(csv_output_file) + logger.info("Writing output to csv file...") + elif quiet: + logger.info("Looking for anomolies in row counts in each sheet...") + else: + print("{0:<90}{1:<40}{2:<10}".format("Workbook", "Sheet", "Row Count")) + print("{:#<150}".format("")) + + # Start with the bicc_template.xlsm BASE data + for line in tmpl_data: + if csv_output: + csv_writer.writerow( + [line["workbook"], line["sheet"], line["row_count"]]) + elif quiet: + pass + else: + print( + f"{line['workbook']:<90}{line['sheet']:<40}{line['row_count']:<10}" + ) + print("{:#<150}".format("")) + for f in os.listdir(returns_dir): + if fnmatch.fnmatch(f, "*.xlsm"): + d = row_check(os.path.join(returns_dir, f)) + zipped_data = zip(tmpl_data, d) + for line in zipped_data: + counts = [i["row_count"] for i in line] + flag = counts[0] != counts[-1] + if not flag: + if csv_output: + csv_writer.writerow([ + line[1]["workbook"], + line[1]["sheet"], + line[1]["row_count"], + ]) + elif quiet: + pass + else: + print( + f"{line[1]['workbook']:<90}{line[1]['sheet']:<40}{line[1]['row_count']:<10}" + ) + else: + if csv_output: + csv_writer.writerow([ + line[1]["workbook"], + line[1]["sheet"], + line[1]["row_count"], + "INCONSISTENT WITH bicc_template.xlsm", + ]) + else: + print( + f"{line[1]['workbook']:<90}{line[1]['sheet']:<40}{line[1]['row_count']:<10} *" + ) + if not quiet: + print("{:#<150}".format("")) + else: + print(".") + else: + logger.critical(f"{f} does not have .xlsm file extension.") + if csv_output: + print(f"csv output file available at {csv_output_path}") + csv_output_file.close() + + +def quick_typechecker(*args): + """ + Very simple function to filter allowed types (int, float). Any other type + returns False. All arguments must be of same type. + """ + for arg in args: + if isinstance(arg, (int, float, date)): + pass + else: + return False + return True + + +def simple_round(fl, prec): + """Rounds a fl to prec precision.""" + return round(fl, prec) + + +def bc_is_close(x, y): + """Returns true if acceptably close.""" + if isinstance(x, (date, datetime)) or isinstance(y, (date, datetime)): + return False + else: + return isclose(x, y, rel_tol=0.001) + + +def cell_bg_colour(rgb=[]): + """ + Give it a list of integers between 0 and 255 - three of them. + """ + c_value = "{0:02X}{1:02X}{2:02X}".format(*rgb) + return PatternFill(patternType="solid", fgColor=c_value, bgColor=c_value) + + +def get_relevant_names(project_name, project_data): + + try: + sro_first_name = project_data[project_name]["SRO Full Name"].split( + " ")[0] + except IndexError: + logger.warning( + "SRO Full Name ({0}) is not suitable for splitting".format( + project_data[project_name]["SRO Full Name"])) + + try: + sro_last_name = project_data[project_name]["SRO Full Name"].split( + " ")[1] + except IndexError: + logger.warning( + "SRO Full Name ({0}) is not suitable for splitting".format( + project_data[project_name]["SRO Full Name"])) + + try: + pd_first_name = project_data[project_name]["PD Full Name"].split( + " ")[0] + except IndexError: + logger.warning( + "PD Full Name ({0}) is not suitable for splitting".format( + project_data[project_name]["PD Full Name"])) + + try: + pd_last_name = project_data[project_name]["PD Full Name"].split(" ")[1] + except IndexError: + logger.warning( + "PD Full Name ({0}) is not suitable for splitting".format( + project_data[project_name]["PD Full Name"])) + + try: + sro_d = dict(first_name=sro_first_name, last_name=sro_last_name) + except UnboundLocalError: + sro_d = None + try: + pd_d = dict(first_name=pd_first_name, last_name=pd_last_name) + except UnboundLocalError: + pd_d = None + + return (sro_d, pd_d) + + +def project_data_from_master(master_file: str, opened_wb=False): + if opened_wb is False: + wb = load_workbook(master_file) + ws = wb.active + else: + wb = master_file + ws = wb.active + # cleanse the keys + for cell in ws["A"]: + # we don't want to clean None... + if cell.value is None: + continue + c = Cleanser(cell.value) + cell.value = c.clean() + p_dict = {} + for col in ws.iter_cols(min_col=2): + project_name = "" + o = OrderedDict() + for cell in col: + if cell.row == 1: + project_name = cell.value + p_dict[project_name] = o + else: + val = ws.cell(row=cell.row, column=1).value + if type(cell.value) == datetime: + d_value = date(cell.value.year, cell.value.month, + cell.value.day) + p_dict[project_name][val] = d_value + else: + p_dict[project_name][val] = cell.value + # remove any "None" projects that were pulled from the master + try: + del p_dict[None] + except KeyError: + pass + return p_dict + + +def project_data_line(): + p_dict = {} + with open(SOURCE_DIR + "master_transposed.csv", "r") as f: + reader = csv.DictReader(f) + for row in reader: + key = row.pop("Project/Programme Name") + if key in p_dict: + pass + p_dict[key] = row + logger.debug( + "Adding {} to project_data_line dictionary".format(key)) + return p_dict + + +def open_openpyxl_template(template_file): + """ + Opens an xlsx file (the template) and returns the openpyxl object. + """ + wb = load_workbook(template_file, keep_vba=True) + logger.info("Opening {} as an openpyxl object".format(template_file)) + return wb + + +def working_directory(dir_type=None): + """ + Returns the working directory for source files + :return: path to the working directory intended for the source files + """ + docs = os.path.join(os.path.expanduser("~"), "Documents") + bcomp_working_d = "bcompiler" + try: + root_path = os.path.join(docs, bcomp_working_d) + except FileNotFoundError: + print("You need to run with --create-wd to", + "create the working directory") + if dir_type == "source": + return root_path + "/source/" + elif dir_type == "output": + return root_path + "/output/" + elif dir_type == "returns": + return root_path + "/source/returns/" + else: + return + + +# TODO this lot needs cleaning up - no more use of working_directory() + +SOURCE_DIR = working_directory("source") +OUTPUT_DIR = working_directory("output") +RETURNS_DIR = working_directory("returns") +DATAMAP_RETURN_TO_MASTER = SOURCE_DIR + "datamap.csv" +DATAMAP_MASTER_TO_RETURN = SOURCE_DIR + "datamap.csv" +DATAMAP_MASTER_TO_GMPP = SOURCE_DIR + "archive/datamap-master-to-gmpp" +CLEANED_DATAMAP = SOURCE_DIR + "cleaned_datamap.csv" +MASTER = SOURCE_DIR + "master.csv" +TEMPLATE = SOURCE_DIR + BLANK_TEMPLATE_FN +GMPP_TEMPLATE = SOURCE_DIR + "archive/gmpp_template.xlsx" + + +def index_returns_directory(): + """ + Prior to compiling a master, it is useful to get the order of projects + by their file name, as the compile.run() function traverses the directory + top to bottom to build the master. We can then use this to compare with the + order or projects (columns) in the old master document we are comparing + the current compile. This is pretty hackish but needs must... + """ + target_files = [] + for f in os.listdir(RETURNS_DIR): + target_files.append(f) + + pnames_in_returns_dir = [] + for f in target_files: + if fnmatch.fnmatch(f, "*.xlsm"): + wb = load_workbook(os.path.join(RETURNS_DIR, f)) + ws = wb[runtime_config["TemplateSheets"]["summary_sheet"]] + pnames_in_returns_dir.append(ws["B5"].value) + return pnames_in_returns_dir + + +def splat_rows(row): + yield [(c.value, c.row, c.column) for c in row] + + +def parse_csv_to_file(source_file): + """ + Transposes the master to a new master_transposed.csv file. + :param source_file: + :return: + """ + output = open(SOURCE_DIR + "master_transposed.csv", "w+") + try: + source = open(source_file, "r") + except FileNotFoundError: + logger.critical(f"There is no file {source_file} present.") + source.close() + return + with open(source_file, "r") as source_f: + lis = [x.split(",") for x in source_f] + for i in lis: + # we need to do this to remove trailing "\n" from the end of + # each original master.csv line + i[-1] = i[-1].rstrip() + + for x in zip(*lis): + for y in x: + output.write(y + ",") + output.write("\n") + output.close() + + +def create_master_dict_transposed(source_master_csv): + """ + The side-effect of the following function is to ensure there is a + 'master_transposed.csv' file present in SOURCE_DIR + returns a list of dicts, which makes up all the data from the master + """ + parse_csv_to_file(source_master_csv) + with open(SOURCE_DIR + "master_transposed.csv", "r") as f: + r = csv.DictReader(f) + ls = [row for row in r] + return ls + + +sheet_name = "Dropdown" + +VALIDATION_REFERENCES = { + "Quarter": + "{0}!$A$2:$A$9".format(quote_sheetname(sheet_name)), + "Joining Qtr": + "{0}!$B$2:$B$25".format(quote_sheetname(sheet_name)), + "Classification": + "{0}!$C$2:$C$4".format(quote_sheetname(sheet_name)), + "Entity format": + "{0}!$D$2:$D$4".format(quote_sheetname(sheet_name)), + "Methodology": + "{0}!$E$2:$E$10".format(quote_sheetname(sheet_name)), + "Category": + "{0}!$F$2:$H$11".format(quote_sheetname(sheet_name)), + "Scope Changed": + "{0}!$G$2:$I$4".format(quote_sheetname(sheet_name)), + "Monetised / Non Monetised Benefits": + "{0}!$H$2:$H$4".format(quote_sheetname(sheet_name)), + "RAG": + "{0}!$I$2:$I$6".format(quote_sheetname(sheet_name)), + "RAG 2": + "{0}!$J$2:$J$4".format(quote_sheetname(sheet_name)), + "RPA level": + "{0}!$K$2:$K$4".format(quote_sheetname(sheet_name)), + "Capability RAG": + "{0}!$L$2:$L$5".format(quote_sheetname(sheet_name)), + "MPLA / PLP": + "{0}!$M$2:$M$30".format(quote_sheetname(sheet_name)), + "PL Changes": + "{0}!$N$2:$N$31".format(quote_sheetname(sheet_name)), + "Stage": + "{0}!$O$2:$O$10".format(quote_sheetname(sheet_name)), + "Business Cases": + "{0}!$P$2:$P$11".format(quote_sheetname(sheet_name)), + "Milestone Types": + "{0}!$Q$2:$Q$4".format(quote_sheetname(sheet_name)), + "Finance figures format": + "{0}!$R$2:$R$3".format(quote_sheetname(sheet_name)), + "Index Years": + "{0}!$S$2:$S$27".format(quote_sheetname(sheet_name)), + "Discount Rate": + "{0}!$T$2:$T$32".format(quote_sheetname(sheet_name)), + "Finance type": + "{0}!$U$2:$U$6".format(quote_sheetname(sheet_name)), + "Yes/No": + "{0}!$V$2:$V$3".format(quote_sheetname(sheet_name)), + "Years (Spend)": + "{0}!$W$2:$W$90".format(quote_sheetname(sheet_name)), + "Years (Benefits)": + "{0}!$X$2:$X$90".format(quote_sheetname(sheet_name)), + "Snapshot Dates": + "{0}!$Y$2:$Y$9".format(quote_sheetname(sheet_name)), + "Percentage of time spent on SRO role": + "{0}!$Z$2:$Z$21".format(quote_sheetname(sheet_name)), + "AR Category": + "{0}!$AA$2:$AA$5".format(quote_sheetname(sheet_name)), + "Project Lifecycle": + "{0}!$AB$2:$AB$6".format(quote_sheetname(sheet_name)), + "Programme Lifecycle": + "{0}!$AC$2:$AC$7".format(quote_sheetname(sheet_name)), + "Other": + "{0}!$AD$2:$AD$19".format(quote_sheetname(sheet_name)), + "Start / Year end - FY": + "{0}!$AE$3:$AE$22".format(quote_sheetname(sheet_name)), + "Count": + "{0}!$AF$2:$AF$22".format(quote_sheetname(sheet_name)), + "VFM": + "{0}!$AG$2:$AG$11".format(quote_sheetname(sheet_name)), + "DfT Group": + "{0}!$AH$2:$AH$7".format(quote_sheetname(sheet_name)), + "DfT Division": + "{0}!$AI$2:$AI$15".format(quote_sheetname(sheet_name)), + "Agency": + "{0}!$AJ$2:$AJ$9".format(quote_sheetname(sheet_name)), + "High Speed Rail": + "{0}!$AK$2:$AK$4".format(quote_sheetname(sheet_name)), + "Rail Group": + "{0}!$AL$2:$AL$4".format(quote_sheetname(sheet_name)), + "Roads, Devolution & Motoring": + "{0}!$AM$2:$AM$5".format(quote_sheetname(sheet_name)), + "International, Security and Environment": + "{0}!$AN$2:$AN$4".format(quote_sheetname(sheet_name)), + "Resource and Strategy": + "{0}!$AO$2:$AO$2".format(quote_sheetname(sheet_name)), + "Non-Group": + "{0}!$AP$2:$AP$2".format(quote_sheetname(sheet_name)), + "GMPP Annual Report Category": + "{0}!$AQ$2:$AQ$2".format(quote_sheetname(sheet_name)), + "SDP": + "{0}!$AR2:$AR$5".format(quote_sheetname(sheet_name)), +} + + +def row_accessor(row: tuple): + """ + Utility generator yielding tuple of form (str, str); e.g + ('A10', 'Project/Programme Name'). + :param row: + :return: + """ + for item in row: + yield ("".join([item.column, str(item.row)]), item.value) + + +def gen_sheet_data(workbook: str) -> dict: + """ + Returns a dict containing data from a given xlsx file, by sheet + within that workbook. + :param path to xlsx file: + :return: dict of data by sheet in workbook + """ + wb = load_workbook(workbook) + sheets = wb._sheets + data = {} + for s in sheets: + rows = s.rows + title = s.title + data[title] = [list(row_accessor(x)) for x in rows] + return data + + +def parse_data_row(row: list) -> tuple: + """ + Utility generator which processes two-item tuples in a list. + :param row: + :return: tuple of form (str, str); e.g. ('A10', 'Project/Programme Name') + """ + for item in row: + yield item[0], item[1] + + +def get_sheets_in_workbook(real_template: str) -> list: + """ + Utility function to return a list of sheet names from an xlsx file. + :param real_template: + :return: list of sheet names + """ + wb = load_workbook(real_template) + sheets = wb._sheets + return sheets + + +def generate_test_template_from_real(real_template: str, + save_path: str) -> None: + """ + Given the bicc_template.xlsm file, this function strips it of + everything but cell data. + :param real_template: str path of location of bicc_template.xlsm + :param save_path: str path of output directory; file will be named 'gen_bicc_template.xlsm', + of the form "~/Documents" + :return: + """ + data = gen_sheet_data(real_template) + sheets = get_sheets_in_workbook(real_template) + blank = Workbook() + sheet_order = 0 + for sheet in sheets: + summary_sheet = blank.create_sheet(sheet.title, sheet_order) + for row in data[sheet.title]: + r = parse_data_row(row) + for cell in r: + summary_sheet[cell[0]] = cell[1] + sheet_order += 1 + if save_path.endswith("/"): + save_path = save_path[:-1] + blank.save("".join([save_path, "/gen_bicc_template.xlsm"])) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..e4e8b5e --- /dev/null +++ b/setup.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python + +import os + +from setuptools import find_packages, setup + +import datamaps + + +def read(*names): + values = dict() + extensions = ['.txt', '.rst'] + for name in names: + value = '' + for extension in extensions: + filename = name + extension + if os.path.isfile(filename): + value = open(name + extension).read() + break + values[name] = value + return values + + +long_description = """ +%(README)s + +News +==== + +%(CHANGES)s + +""" % read('README', 'CHANGES') + +setup( + name='datamaps', + version=datamaps.__version__, + description='Collect and clean data using Excel spreadsheets.', + long_description=long_description, + classifiers=[ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Other Audience", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Topic :: Text Processing", + ], + keywords='data console commandline excel', + author='Matthew Lemon', + author_email='matt@matthewlemon.com', + maintainer='Matthew Lemon', + maintainer_email='matt@matthewlemon.com', + url='https://github.com/hammerheadlemon/datamaps', + packages=find_packages(), + python_requires='>=3.6', + entry_points={'console_scripts': [ + 'datamaps = datamaps.main:cli' + ]}, + setup_requires=['wheel'], + install_requires=[ + 'click', + 'colorlog', + 'python-dateutil', + 'bcompiler-engine @ https://github.com/hammerheadlemon/bcompiler-engine/archive/master.zip#egg=bcompiler-engine' + ], + test_suite='datamaps.tests') |