aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatthew Lemon <lemon@matthewlemon.com>2019-09-27 15:12:57 +0100
committerMatthew Lemon <lemon@matthewlemon.com>2019-09-27 15:12:57 +0100
commitd4272b28da76c2d1fe110d611ef08a445f11ceb2 (patch)
treeaf01f431bd39504bb2d9234c7c6a41879b95051f
initial commit
-rw-r--r--.gitignore1
-rw-r--r--Pipfile18
-rw-r--r--Pipfile.lock159
-rw-r--r--datamaps/__init__.py1
-rw-r--r--datamaps/api/__init__.py5
-rw-r--r--datamaps/api/api.py14
-rw-r--r--datamaps/core/__init__.py3
-rw-r--r--datamaps/core/master.py208
-rw-r--r--datamaps/core/row.py84
-rw-r--r--datamaps/core/temporal.py121
-rw-r--r--datamaps/main.py131
-rw-r--r--datamaps/process/__init__.py1
-rw-r--r--datamaps/process/cell.py42
-rw-r--r--datamaps/process/cleansers.py287
-rw-r--r--datamaps/tests/__init__.py0
-rw-r--r--datamaps/tests/conftest.py9
-rw-r--r--datamaps/tests/resources/.~lock.master.xlsx#1
-rw-r--r--datamaps/tests/resources/master.xlsxbin0 -> 5889 bytes
-rw-r--r--datamaps/tests/test_api.py10
-rw-r--r--datamaps/tests/test_cleanser.py41
-rw-r--r--datamaps/tests/test_quarter.py52
-rw-r--r--datamaps/utils.py564
-rw-r--r--setup.py66
23 files changed, 1818 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9f11b75
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea/
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..7b1b4b4
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,18 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = "*"
+
+[packages]
+openpyxl = "*"
+bcompiler-engine = {editable = true,path = "/home/lemon/code/python/bcompiler-engine"}
+colorlog = "*"
+python-dateutil = "*"
+click = "*"
+datamaps = {editable = true,path = "."}
+
+[requires]
+python_version = "3.8"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..101c7c4
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,159 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "98952e013f654efa60ba726c96572096e1bdaa9f45d92d0bb65d9faf905bffa2"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.8"
+ },
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "appdirs": {
+ "hashes": [
+ "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92",
+ "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e"
+ ],
+ "version": "==1.4.3"
+ },
+ "bcompiler-engine": {
+ "editable": true,
+ "path": "/home/lemon/code/python/bcompiler-engine"
+ },
+ "click": {
+ "hashes": [
+ "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
+ "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+ ],
+ "index": "pypi",
+ "version": "==7.0"
+ },
+ "colorlog": {
+ "hashes": [
+ "sha256:3cf31b25cbc8f86ec01fef582ef3b840950dea414084ed19ab922c8b493f9b42",
+ "sha256:450f52ea2a2b6ebb308f034ea9a9b15cea51e65650593dca1da3eb792e4e4981"
+ ],
+ "index": "pypi",
+ "version": "==4.0.2"
+ },
+ "datamaps": {
+ "editable": true,
+ "path": "."
+ },
+ "et-xmlfile": {
+ "hashes": [
+ "sha256:614d9722d572f6246302c4491846d2c393c199cfa4edc9af593437691683335b"
+ ],
+ "version": "==1.0.1"
+ },
+ "jdcal": {
+ "hashes": [
+ "sha256:1abf1305fce18b4e8aa248cf8fe0c56ce2032392bc64bbd61b5dff2a19ec8bba",
+ "sha256:472872e096eb8df219c23f2689fc336668bdb43d194094b5cc1707e1640acfc8"
+ ],
+ "version": "==1.4.1"
+ },
+ "openpyxl": {
+ "hashes": [
+ "sha256:340a1ab2069764559b9d58027a43a24db18db0e25deb80f81ecb8ca7ee5253db"
+ ],
+ "index": "pypi",
+ "version": "==3.0.0"
+ },
+ "python-dateutil": {
+ "hashes": [
+ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+ "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+ ],
+ "index": "pypi",
+ "version": "==2.8.0"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ }
+ },
+ "develop": {
+ "atomicwrites": {
+ "hashes": [
+ "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+ "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+ ],
+ "version": "==1.3.0"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
+ "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+ ],
+ "version": "==19.1.0"
+ },
+ "more-itertools": {
+ "hashes": [
+ "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832",
+ "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4"
+ ],
+ "version": "==7.2.0"
+ },
+ "packaging": {
+ "hashes": [
+ "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47",
+ "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108"
+ ],
+ "version": "==19.2"
+ },
+ "pluggy": {
+ "hashes": [
+ "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6",
+ "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34"
+ ],
+ "version": "==0.13.0"
+ },
+ "py": {
+ "hashes": [
+ "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
+ "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
+ ],
+ "version": "==1.8.0"
+ },
+ "pyparsing": {
+ "hashes": [
+ "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80",
+ "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4"
+ ],
+ "version": "==2.4.2"
+ },
+ "pytest": {
+ "hashes": [
+ "sha256:813b99704b22c7d377bbd756ebe56c35252bb710937b46f207100e843440b3c2",
+ "sha256:cc6620b96bc667a0c8d4fa592a8c9c94178a1bd6cc799dbb057dfd9286d31a31"
+ ],
+ "index": "pypi",
+ "version": "==5.1.3"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ }
+ }
+}
diff --git a/datamaps/__init__.py b/datamaps/__init__.py
new file mode 100644
index 0000000..c9bd737
--- /dev/null
+++ b/datamaps/__init__.py
@@ -0,0 +1 @@
+__version__ = "1.0.0b0"
diff --git a/datamaps/api/__init__.py b/datamaps/api/__init__.py
new file mode 100644
index 0000000..bb48fdb
--- /dev/null
+++ b/datamaps/api/__init__.py
@@ -0,0 +1,5 @@
+from .api import project_data_from_master_api as project_data_from_master
+from ..core.master import Master as Master
+from ..core.row import Row as Row
+from ..core.temporal import FinancialYear as FinancialYear
+from ..core.temporal import Quarter as Quarter
diff --git a/datamaps/api/api.py b/datamaps/api/api.py
new file mode 100644
index 0000000..59458e7
--- /dev/null
+++ b/datamaps/api/api.py
@@ -0,0 +1,14 @@
+from ..core import Master, Quarter
+
+
+def project_data_from_master_api(master_file: str, quarter: int, year: int):
+ """Create a Master object directly without the need to explicitly pass
+ a Quarter object.
+
+ Args:
+ master_file (str): the path to a master file
+ quarter (int): an integer representing the financial quarter
+ year (int): an integer representing the year
+ """
+ m = Master(Quarter(quarter, year), master_file)
+ return m
diff --git a/datamaps/core/__init__.py b/datamaps/core/__init__.py
new file mode 100644
index 0000000..305208d
--- /dev/null
+++ b/datamaps/core/__init__.py
@@ -0,0 +1,3 @@
+from .row import Row
+from .temporal import Quarter, FinancialYear
+from .master import Master, ProjectData
diff --git a/datamaps/core/master.py b/datamaps/core/master.py
new file mode 100644
index 0000000..1fbfe90
--- /dev/null
+++ b/datamaps/core/master.py
@@ -0,0 +1,208 @@
+import re
+import datetime
+import logging
+import unicodedata
+from pathlib import Path
+from typing import List, Tuple, Iterable, Optional, Any
+
+from ..utils import project_data_from_master
+from ..process.cleansers import DATE_REGEX_4
+from .temporal import Quarter
+
+from openpyxl import load_workbook
+
+logger = logging.getLogger('bcompiler.utils')
+
+
+class ProjectData:
+ """
+ ProjectData class
+ """
+ def __init__(self, d: dict) -> None:
+ """
+ :py:func:`OrderedDict` is easiest to get from project_data_from_master[x]
+ """
+ self._data = d
+
+ def __len__(self) -> int:
+ return len(self._data)
+
+ def __getitem__(self, item):
+ return self._data[item]
+
+ def key_filter(self, key: str) -> List[Tuple]:
+ """
+ Return a list of (k, v) tuples if k in master key.
+ """
+ data = [item for item in self._data.items() if key in item[0]]
+ if not data:
+ raise KeyError("Sorry, there is no matching data")
+ return (data)
+
+ def pull_keys(self, input_iter: Iterable, flat=False) -> List[Tuple[Any, ...]]:
+ """
+ Returns a list of (key, value) tuples from ProjectData if key matches a
+ key. The order of tuples is based on the order of keys passed in the iterable.
+ """
+ if flat is True:
+ # search and replace troublesome EN DASH character
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0].strip().replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')) == i]
+ xs = [_convert_str_date_to_object(x) for x in xs]
+ ts = sorted(xs, key=lambda x: input_iter.index(x[0].strip().replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS'))))
+ ts = [item[1] for item in ts]
+ return ts
+ else:
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0].replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')) == i]
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0] == i]
+ xs = [_convert_str_date_to_object(x) for x in xs]
+ ts = sorted(xs, key=lambda x: input_iter.index(x[0].replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS'))))
+ return ts
+
+ def __repr__(self):
+ return f"ProjectData() - with data: {id(self._data)}"
+
+
+def _convert_str_date_to_object(d_str: tuple) -> Tuple[str, Optional[datetime.date]]:
+ try:
+ if re.match(DATE_REGEX_4, d_str[1]):
+ try:
+ ds = d_str[1].split('-')
+ return (d_str[0], datetime.date(int(ds[0]), int(ds[1]), int(ds[2])))
+ except TypeError:
+ return d_str
+ else:
+ return d_str
+ except TypeError:
+ return d_str
+
+
+class Master:
+ """A Master object, representing the main central data item in ``bcompiler``.
+
+ Args:
+        quarter (:py:class:`bcompiler.api.Quarter`): created using ``Quarter(1, 2017)`` for example.
+ path (str): path to the master xlsx file
+
+ A master object is a composition between a :py:class:`bcompiler.api.Quarter` object and an
+ actual master xlsx file on disk.
+
+ You create one, either by creating the Quarter object first, and using that as the first
+ parameter of the ``Master`` constructor, e.g.::
+
+ from bcompiler.api import Quarter
+ from bcompiler.api import Master
+
+ q1 = Quarter(1, 2016)
+ m1 = Master(q1, '/tmp/master_1_2016.xlsx')
+
+ or by doing both in one::
+
+ m1 = Master(Quarter(1, 2016), '/tmp/master_1_2016.xlsx')
+
+ Once you have a ``Master`` object, you can access project data from it, like this::
+
+ project_data = m1['Project Title']
+
+
+ The following *attributes* are available on `m1` once created as such, e.g.::
+
+ data = m1.data
+ quarter = m1.quarter
+ filename = m1.filename
+ ..etc
+ """
+ def __init__(self, quarter: Quarter, path: str) -> None:
+ self._quarter = quarter
+ self.path = path
+ self._data = project_data_from_master(self.path)
+ self._project_titles = [item for item in self.data.keys()]
+ self.year = self._quarter.year
+
+ def __getitem__(self, project_name):
+ return ProjectData(self._data[project_name])
+
+ @property
+ def data(self):
+ """Return all the data contained in the master in a large, nested dictionary.
+
+        The resulting data structure contains a dictionary of :py:class:`collections.OrderedDict` items whose
+ key is the name of a project::
+
+ "Project Name": OrderedDict("key": "value"
+ ...)
+
+ This object can then be further interrogated, for example to obtain all key/values
+        from a particular project, by doing::
+
+ d = Master.data
+ project_data = d['PROJECT_NAME']
+
+ """
+ return self._data
+
+ @property
+ def quarter(self):
+ """Returns the ``Quarter`` object associated with the ``Master``.
+
+ Example::
+
+ q1 = m.quarter
+
+ ``q1`` can then be further interrogated as documented in :py:class:`core.temporal.Quarter`.
+
+ """
+
+ return self._quarter
+
+ @property
+ def filename(self):
+ """The filename of the master xlsx file, e.g. ``master_1_2017.xlsx``.
+ """
+ p = Path(self.path)
+ return p.name
+
+ @property
+ def projects(self):
+ """A list of project titles derived from the master xlsx.
+ """
+ return self._project_titles
+
+ def duplicate_keys(self, to_log=None):
+ """Checks for duplicate keys in a master xlsx file.
+
+ Args:
+ to_log (bool): Optional True or False, depending on whether you want to see duplicates reported in a ``WARNING`` log message. This is used mainly for internal purposes within ``bcompiler``.
+
+ Returns:
+ duplicates (set): a set of duplicated keys
+ """
+ wb = load_workbook(self.path)
+ ws = wb.active
+ col_a = next(ws.iter_cols())
+ col_a = [item.value for item in col_a]
+ seen: set = set()
+ uniq = []
+ dups: set = set()
+ for x in col_a:
+ if x not in seen:
+ uniq.append(x)
+ seen.add(x)
+ else:
+ dups.add(x)
+ if to_log and len(dups) > 0:
+ for x in dups:
+ logger.warning(f"{self.path} contains duplicate key: \"{x}\". Masters cannot contain duplicate keys. Rename them.")
+ return True
+ elif to_log and len(dups) == 0:
+ logger.info(f"No duplicate keys in {self.path}")
+ return False
+ elif len(dups) > 0:
+ return dups
+ else:
+ return False
+
+ def __repr__(self):
+ return f"Master({self.path}, {self.quarter.quarter}, {self.quarter.year})"
diff --git a/datamaps/core/row.py b/datamaps/core/row.py
new file mode 100644
index 0000000..470b753
--- /dev/null
+++ b/datamaps/core/row.py
@@ -0,0 +1,84 @@
+import string
+import datetime
+
+from ..process.cell import Cell
+from typing import Iterable
+
+from itertools import chain
+
+
+class Row:
+ """
+ A Row object is populated with an iterable (list or other sequence), bound
+ to an openpyxl worksheet. It is used to populate a row of cells in an output
+ Excel file with the values from the iterable.
+
+ The ``anchor_column`` and ``anchor_row`` parameters represent the coordinates of
+ a cell which form the *leftmost* cell of the row, i.e. to set the row of data
+    to start at the very top left of a sheet, you'd create the ``Row()`` object like this::
+
+        r = Row(1, 1, iterable)
+ r.bind(ws)
+ """
+
+ def __init__(self, anchor_column: int, anchor_row: int, seq: Iterable):
+ if isinstance(anchor_column, str):
+ if len(anchor_column) == 1:
+ enumerated_alphabet = list(enumerate(string.ascii_uppercase, start=1))
+ col_letter = [x for x in enumerated_alphabet if x[1] == anchor_column][0]
+ self._anchor_column = col_letter[0]
+ self._anchor_row = anchor_row
+ self._cell_map = []
+ elif len(anchor_column) == 2:
+ enumerated_alphabet = list(enumerate(list(chain(
+ string.ascii_uppercase, ["{}{}".format(x[0], x[1]) for x in list(zip(['A'] * 26, string.ascii_uppercase))])), start=1))
+ col_letter = [x for x in enumerated_alphabet if x[1] == anchor_column][0]
+ self._anchor_column = col_letter[0]
+ self._anchor_row = anchor_row
+ self._cell_map = []
+ else:
+ raise ValueError("You can only have a column up to AZ")
+ else:
+ self._anchor_column = anchor_column
+ self._anchor_row = anchor_row
+ self._cell_map = []
+ self._seq = seq
+
+
+ def _basic_bind(self, ws):
+ for x in list(enumerate(self._seq, start=self._anchor_column)):
+ self._ws.cell(row=self._anchor_row, column=x[0], value=x[1])
+
+
+ def _cell_bind(self, ws):
+ self._cell_map = []
+ for x in list(enumerate(self._seq, start=self._anchor_column)):
+ self._cell_map.append(
+ Cell(
+ cell_key="",
+ cell_value=x[1],
+ cell_reference=f"{self._anchor_column}{self._anchor_row}",
+ template_sheet=ws,
+ bg_colour=None,
+ fg_colour=None,
+ number_format=None,
+ verification_list=None,
+ r_idx=self._anchor_row,
+ c_idx=x[0]
+ )
+ )
+ for c in self._cell_map:
+            if not isinstance(c.cell_value, datetime.date) and c.cell_value is not None:
+ self._ws.cell(row=c.r_idx, column=c.c_idx, value=c.cell_value).number_format = '0'
+ else:
+ self._ws.cell(row=c.r_idx, column=c.c_idx, value=c.cell_value)
+
+
+
+ def bind(self, worksheet):
+        """Bind the Row to a particular worksheet, which effectively does the
+ printing of data into cells. Must be done prior to saving the workbook.
+ """
+ self._ws = worksheet
+# self._basic_bind(self._ws)
+ self._cell_bind(self._ws)
diff --git a/datamaps/core/temporal.py b/datamaps/core/temporal.py
new file mode 100644
index 0000000..ddb7004
--- /dev/null
+++ b/datamaps/core/temporal.py
@@ -0,0 +1,121 @@
+import datetime
+
+
+class FinancialYear:
+ """An object representing a financial year.
+
+ Used by ``bcompiler`` internally when creating :py:class:`bcompiler.api.Master` objects.
+    Can be used to calculate start and end dates and :py:class:`bcompiler.api.Quarter` objects.
+
+    The parameter ``year`` must be an integer in the range 1950 - 2100.
+
+ """
+
+ def __init__(self, year):
+        if isinstance(year, int) and (year in range(1950, 2100)):
+ self.year = year
+ else:
+ raise ValueError("A year must be an integer between 1950 and 2100")
+ self._generate_quarters()
+ self._q1 = self.quarters[0]
+ self._q2 = self.quarters[1]
+ self._q3 = self.quarters[2]
+ self._q4 = self.quarters[3]
+
+ self.start_date = self.q1.start_date
+ self.end_date = self.q4.end_date
+
+ @property
+ def q1(self) -> datetime.date:
+ """Quarter 1 as a :py:class:`datetime.date` object
+ """
+ return self._q1
+
+ @property
+ def q2(self):
+ """Quarter 2 as a :py:class:`datetime.date` object
+ """
+ return self._q2
+
+ @property
+ def q3(self):
+ """Quarter 3 as a :py:class:`datetime.date` object
+ """
+ return self._q3
+
+ @property
+ def q4(self):
+ """Quarter 4 as a :py:class:`datetime.date` object
+ """
+ return self._q4
+
+ def __str__(self):
+ return f"FY{str(self.year)}/{str(self.year + 1)[2:]}"
+
+ def _generate_quarters(self):
+ self.quarters = [Quarter(x, self.year) for x in range(1, 5)]
+
+
+ def __repr__(self):
+ return f"FinancialYear({self.year})"
+
+
+class Quarter:
+ """An object representing a financial quarter. This is mainly required for building
+ a :py:class:`core.master.Master` object.
+
+ Args:
+ quarter (int): e.g.1, 2, 3 or 4
+ year (int): e.g. 2013
+ """
+ _start_months = {
+ 1: (4, 'April'),
+ 2: (7, 'July'),
+ 3: (10, 'October'),
+ 4: (1, 'January')
+ }
+
+ _end_months = {
+ 1: (6, 'June', 30),
+ 2: (9, 'September', 30),
+ 3: (12, 'December', 31),
+ 4: (3, 'March', 31),
+ }
+
+
+ def __init__(self, quarter: int, year: int) -> None:
+
+ if isinstance(quarter, int) and (quarter >= 1 and quarter <= 4):
+ self.quarter = quarter
+ else:
+ raise ValueError("A quarter must be either 1, 2, 3 or 4")
+
+ if isinstance(year, int) and (year in range(1950, 2100)):
+ self.year = year
+ else:
+ raise ValueError("Year must be between 1950 and 2100 - surely that will do?")
+
+ self.start_date = self._start_date(self.quarter, self.year)
+ self.end_date = self._end_date(self.quarter, self.year)
+
+ def __str__(self):
+ return f"Q{self.quarter} {str(self.year)[2:]}/{str(self.year + 1)[2:]}"
+
+ def _start_date(self, q, y):
+ if q == 4:
+ y = y + 1
+ return datetime.date(y, Quarter._start_months[q][0], 1)
+
+ def _end_date(self, q, y):
+ if q == 4:
+ y = y + 1
+ return datetime.date(y, Quarter._end_months[q][0], Quarter._end_months[q][2])
+
+ def __repr__(self):
+ return f"Quarter({self.quarter}, {self.year})"
+
+ @property
+ def fy(self):
+ """Return a :py:class:`core.temporal.FinancialYear` object.
+ """
+ return FinancialYear(self.year)
diff --git a/datamaps/main.py b/datamaps/main.py
new file mode 100644
index 0000000..b2bd202
--- /dev/null
+++ b/datamaps/main.py
@@ -0,0 +1,131 @@
+"""
+Copyright (c) 2019 Matthew Lemon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE. """
+import logging
+from functools import partial
+
+import click
+
+import colorlog
+from engine.adapters import cli as engine_cli
+from engine.config import Config as engine_config
+from engine.use_cases.parsing import MalFormedCSVHeaderException
+
+logger = colorlog.getLogger("bcompiler")
+logger.setLevel(logging.INFO)
+
+# we want to pass echo func down to bcompiler-engine
+output_funcs = dict(
+ click_echo_green=partial(click.secho, nl=False, fg="green"),
+ click_echo_yellow=partial(click.secho, nl=False, fg="yellow"),
+ click_echo_red=partial(click.secho, nl=False, fg="red"),
+ click_echo_white=partial(click.secho, nl=False, fg="white"),
+)
+
+
+class Config:
+ def __init__(self):
+ self.verbose = False
+
+
+pass_config = click.make_pass_decorator(Config, ensure=True)
+
+
+@click.group()
+@click.option("--verbose", is_flag=True)
+@pass_config
+def cli(config, verbose):
+ """
+ bcompiler is a tool for moving data to and from spreadsheets. See web site, etc.
+ """
+ config.verbose = verbose
+
+
+@cli.group("import")
+def _import():
+ """
+ Import something (a batch of populated templates, a datamap, etc).
+ """
+
+
+@cli.group("export")
+def export():
+ """
+ Export something (master data to blank templates, etc).
+ """
+
+
+@cli.group("report")
+def report():
+ """Create a report"""
+
+
+@_import.command()
+@click.option(
+ "--to-master",
+ "-m",
+ is_flag=True,
+ default=False,
+ help="Create master.xlsx immediately",
+)
+def templates(to_master):
+ engine_config.initialise()
+ click.secho("Hello from bcompiler 2.0!", fg="yellow")
+ if to_master:
+ try:
+ engine_cli.import_and_create_master(echo_funcs=output_funcs)
+ except MalFormedCSVHeaderException as e:
+ click.echo(
+ click.style("Incorrect headers in datamap. {}.".format(e.args[0]), bold=True, reverse=True, fg="cyan"))
+ else:
+ click.secho("Not implemented yet. Try --to-master/-m flag")
+
+
+@export.command()
+# @click.argument("datamap")
+# @click.argument("blank")
+@click.argument("master")
+def master(master):
+ engine_config.initialise()
+
+ input_dir = engine_config.PLATFORM_DOCS_DIR / "input"
+
+ blank_fn = engine_config.config_parser["DEFAULT"]["blank file name"]
+ datamap_fn = engine_config.config_parser["DEFAULT"]["datamap file name"]
+
+ blank = input_dir / blank_fn
+ datamap = input_dir / datamap_fn
+
+ click.secho(f"Exporting master {master} to templates based on {blank}...")
+
+ try:
+ engine_cli.write_master_to_templates(blank, datamap, master)
+ except FileNotFoundError as e:
+ click.secho(str(e), fg="red")
+
+
+@report.command()
+@click.argument("target_file")
+def data_validations(target_file):
+ """Requires the path to the target spreadsheet file."""
+ click.secho(f"Getting data validations from: {target_file}", fg="green")
+ engine_config.initialise()
+ report = engine_cli.report_data_validations_in_file(target_file)
+ for r in report:
+ click.secho(r)
diff --git a/datamaps/process/__init__.py b/datamaps/process/__init__.py
new file mode 100644
index 0000000..12a88fc
--- /dev/null
+++ b/datamaps/process/__init__.py
@@ -0,0 +1 @@
+from .cleansers import Cleanser
diff --git a/datamaps/process/cell.py b/datamaps/process/cell.py
new file mode 100644
index 0000000..edd964b
--- /dev/null
+++ b/datamaps/process/cell.py
@@ -0,0 +1,42 @@
+from typing import Any, Union
+
+
+class Cell:
+ """
+ Purpose of the Cell object is to hold data about a spreadsheet cell.
+ They are used to populate a datamap cell_map and to write out data to
+ a template.
+ """
+ def __init__(self,
+ cell_key: str,
+ cell_value: Any,
+ cell_reference: str,
+ template_sheet: str,
+ bg_colour: Union[str, None],
+ fg_colour: Union[str, None],
+ number_format: Union[str, None],
+ verification_list: Union[str, None],
+ r_idx: int=None,
+ c_idx: int=None
+ ) -> None:
+ if cell_value:
+ self.cell_value = cell_value
+ else:
+ self.cell_value = None
+ self.cell_key = cell_key
+ self.cell_reference = cell_reference
+ self.template_sheet = template_sheet
+ self.bg_colour = bg_colour
+ self.fg_colour = fg_colour
+ self.number_format = number_format
+ self.verification_list = verification_list
+ self.r_idx = r_idx
+ self.c_idx = c_idx
+
+ def __repr__(self) -> str:
+ return ("<Cell: cell_key: {} cell_value: {} cell_reference: {} "
+ "template_sheet: {}>".format(
+ self.cell_key,
+ self.cell_value,
+ self.cell_reference,
+ self.template_sheet))
diff --git a/datamaps/process/cleansers.py b/datamaps/process/cleansers.py
new file mode 100644
index 0000000..d562f9c
--- /dev/null
+++ b/datamaps/process/cleansers.py
@@ -0,0 +1,287 @@
+import datetime
+import re
+from datetime import date
+from operator import itemgetter
+
+import colorlog
+from dateutil.parser import parse
+
+logger = colorlog.getLogger("bcompiler.cleanser")
+
+ENDASH_REGEX = r"–"
+ENDASH_FIX = r"-"
+COMMA_REGEX = r",\s?"
+COMMA_FIX = r" "
+APOS_REGEX = r"^'"
+APOS_FIX = r""
+TRAILING_SPACE_REGEX = r"(.+)( | )$"
+DATE_REGEX = r"^(\d{2,4})(\/|-|\.)(\d{1,2})(\/|-|\.)(\d{2,4})"
+DATE_REGEX_4 = r"^(\d{2,4})(/|-|\.)(\d{1,2})(/|-|\.)(\d{1,2})"
+DATE_REGEX_TIME = r"^(\d{2,4})(/|-)(\d{1,2})(/|-)(\d{1,2})\s(0:00:00)"
+INT_REGEX = r"^[-+]?\d+$"
+FLOAT_REGEX = r"^[-+]?([0-9]*)\.[0-9]+$"
+NL_REGEX = r"\n"
+NL_FIX = r" | "
+SPACE_PIPE_CHAR_REGEX = r"\ \|\S"
+SPACE_PIPE_CHAR_FIX = r" | "
+PERCENT_REGEX = r"^(\d{1,3})%$"
+POUND_REGEX = r"^(-)?£(\d+(\.\d{1,2})?)(\d+)?$" # handles negative numbers
+
+
+class Cleanser:
+ """
+ Takes a string, and cleans it.
+
+ Doctests:
+ >>> t = "Text, with commas"
+ >>> c = Cleanser(t)
+ >>> c.clean()
+ 'Text with commas'
+ >>> a = "\'Text with leading apos."
+ >>> c = Cleanser(a)
+ >>> c.clean()
+ 'Text with leading apos.'
+
+ """
+
+ def __init__(self, string):
+ self.string = string
+
+ # a list of dicts that describe everything needed to fix errors in
+ # string passed to class constructor. Method self.clean() runs through
+ # them, fixing each in turn.
+ self._checks = [
+ dict(
+ c_type="emdash",
+ rule=ENDASH_REGEX,
+ fix=ENDASH_FIX,
+ func=self._endash,
+ count=0,
+ ),
+ dict(
+ c_type="commas",
+ rule=COMMA_REGEX,
+ fix=COMMA_FIX,
+ func=self._commas,
+ count=0,
+ ),
+ dict(
+ c_type="leading_apostrophe",
+ rule=APOS_REGEX,
+ fix=APOS_FIX,
+ func=self._apostrophe,
+ count=0,
+ ),
+ dict(c_type="newline",
+ rule=NL_REGEX,
+ fix=NL_FIX,
+ func=self._newline,
+ count=0),
+ dict(
+ c_type="double_space",
+ rule=" ",
+ fix=" ",
+ func=self._doublespace,
+ count=0,
+ ),
+ dict(
+ c_type="trailing_space",
+ rule=TRAILING_SPACE_REGEX,
+ fix=None,
+ func=self._trailingspace,
+ count=0,
+ ),
+ dict(
+ c_type="pipe_char",
+ rule=SPACE_PIPE_CHAR_REGEX,
+ fix=SPACE_PIPE_CHAR_FIX,
+ func=self._space_pipe_char,
+ count=0,
+ ),
+ dict(c_type="date",
+ rule=DATE_REGEX,
+ fix=None,
+ func=self._date,
+ count=0),
+ dict(
+ c_type="date_time",
+ rule=DATE_REGEX_TIME,
+ fix=None,
+ func=self._date_time,
+ count=0,
+ ),
+ dict(c_type="int",
+ rule=INT_REGEX,
+ fix=None,
+ func=self._int,
+ count=0),
+ dict(c_type="float",
+ rule=FLOAT_REGEX,
+ fix=None,
+ func=self._float,
+ count=0),
+ dict(
+ c_type="percent",
+ rule=PERCENT_REGEX,
+ fix=None,
+ func=self._percent,
+ count=0,
+ ),
+ dict(c_type="pound",
+ rule=POUND_REGEX,
+ fix=None,
+ func=self._pound,
+ count=0),
+ ]
+ self.checks_l = len(self._checks)
+ self._analyse()
+
+ def _sort_checks(self):
+ """
+ Sorts the list of dicts in self._checks by their count, highest
+ first, so that when the fix methods run down them, they always have
+ a count with a value higher than 0 to run with, otherwise later
+ fixes might not get hit.
+ """
+ self._checks = sorted(self._checks,
+ key=itemgetter("count"),
+ reverse=True)
+
+ def _endash(self, regex, fix):
+ """
+ Turns – into -.
+ """
+ return re.sub(regex, fix, self.string)
+
+ def _pound(self, regex, fix):
+ """
+ Turns £12.24 into 12.24 (a float).
+ """
+ m = re.match(regex, self.string)
+ sum_p = m.group(2)
+ if m.group(1) == "-":
+ return float(sum_p) * -1
+ else:
+ return float(sum_p)
+
+ def _percent(self, regex, fix):
+ """
+ Turns 100% into 1.0.
+ """
+ m = re.match(regex, self.string)
+ p = int(m.group(1))
+ return p / 100
+
+ def _float(self, regex, fix):
+ """
+ Turns numbers that look like floats into floats.
+ """
+ return float(self.string)
+
+ def _int(self, regex, fix):
+ """
+ Turns numbers that look like integers into integers.
+ """
+ return int(self.string)
+
+ def _date(self, regex, fix):
+ """
+ Handles dates in "03/05/2016" format.
+ """
+ m = re.match(regex, self.string)
+ if int(m.groups()[-1]) in range(1965, 1967):
+ logger.warning(
+ ("Dates inputted as dd/mm/65 will migrate as dd/mm/2065. "
+ "Dates inputted as dd/mm/66 will migrate as dd/mm/1966."))
+ try:
+ if len(m.string.split("-")[0]) == 4: # year is first
+ return datetime.date(
+ int(m.string.split("-")[0]),
+ int(m.string.split("-")[1]),
+ int(m.string.split("-")[2]),
+ )
+ else:
+ return parse(m.string, dayfirst=True).date()
+ except IndexError:
+ pass
+ except ValueError:
+ logger.warning(
+ 'Potential date issue (perhaps a date mixed with free text?): "{}"'
+ .format(self.string))
+ return self.string
+
+ def _date_time(self, regex, fix):
+ """
+ Handles dates in "2017-05-01 0:00:00" format. We get this from the
+ csv file when we send it back out to templates/forms. Returns a Python
+ date object.
+ """
+ m = re.match(regex, self.string)
+ year = int(m.group(1))
+ month = int(m.group(3))
+ day = int(m.group(5))
+ try:
+ return date(year, month, day)
+ except ValueError:
+ logger.error("Incorrect date format {}!".format(self.string))
+ return self.string
+
+ def _commas(self, regex, fix):
+ """
+ Handles commas in self.string according to rule in self._checks
+ """
+ # we want to sort the list first so self._checks has any item
+ # with a count > 0 up front, otherwise if a count of 0 appears
+ # before it in the list, the > 0 count never gets fixed
+ return re.sub(regex, fix, self.string)
+
+ def _apostrophe(self, regex, fix):
+ """Handles apostrophes as first char of the string."""
+ return self.string.lstrip("'")
+
+ def _newline(self, regex, fix):
+ """Handles newlines anywhere in string."""
+ return re.sub(regex, fix, self.string)
+
+ def _doublespace(self, regex, fix):
+ """Handles double-spaces anywhere in string."""
+ return re.sub(regex, fix, self.string)
+
+ def _trailingspace(self, regex, fix):
+ """Handles trailing space in the string."""
+ return self.string.strip()
+
+ def _space_pipe_char(self, regex, fix):
+ """Handles space pipe char anywhere in string."""
+ return re.sub(regex, fix, self.string)
+
+ def _access_checks(self, c_type):
+ """Helper method returns the index of rule in self._checks
+ when given a c_type"""
+ return self._checks.index(
+ next(item for item in self._checks if item["c_type"] == c_type))
+
+ def _analyse(self):
+ """
+ Uses the self._checks table as a basis for counting the number of
+ each cleaning target required, and calling the appropriate method
+ to clean.
+ """
+ i = 0
+ while i < self.checks_l:
+ matches = re.finditer(self._checks[i]["rule"], self.string)
+ if matches:
+ self._checks[i]["count"] += len(list(matches))
+ i += 1
+
+ def clean(self):
+ """Runs each applicable cleaning action and returns the cleaned
+ string."""
+ self._sort_checks()
+ for check in self._checks:
+ if check["count"] > 0:
+ self.string = check["func"](check["rule"], check["fix"])
+ check["count"] = 0
+ else:
+ return self.string
+ return self.string
diff --git a/datamaps/tests/__init__.py b/datamaps/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/datamaps/tests/__init__.py
diff --git a/datamaps/tests/conftest.py b/datamaps/tests/conftest.py
new file mode 100644
index 0000000..22f20bf
--- /dev/null
+++ b/datamaps/tests/conftest.py
@@ -0,0 +1,9 @@
+import os
+
+import pytest
+from pathlib import Path
+
+
@pytest.fixture
def master() -> Path:
    """Path to the test master spreadsheet.

    Resolved relative to this conftest file rather than the current working
    directory, so the tests pass no matter where pytest is invoked from.
    """
    return Path(__file__).parent / "resources" / "master.xlsx"
diff --git a/datamaps/tests/resources/.~lock.master.xlsx# b/datamaps/tests/resources/.~lock.master.xlsx#
new file mode 100644
index 0000000..7fb98fe
--- /dev/null
+++ b/datamaps/tests/resources/.~lock.master.xlsx#
@@ -0,0 +1 @@
+,lemon,grafter.chutney.lan,27.09.2019 14:05,file:///home/lemon/.config/libreoffice/4; \ No newline at end of file
diff --git a/datamaps/tests/resources/master.xlsx b/datamaps/tests/resources/master.xlsx
new file mode 100644
index 0000000..bf5adf0
--- /dev/null
+++ b/datamaps/tests/resources/master.xlsx
Binary files differ
diff --git a/datamaps/tests/test_api.py b/datamaps/tests/test_api.py
new file mode 100644
index 0000000..273f984
--- /dev/null
+++ b/datamaps/tests/test_api.py
@@ -0,0 +1,10 @@
+import datetime
+
+from ..api import project_data_from_master
+
+
def test_get_project_data(master):
    """The master object exposes project data by file name and carries the
    quarter it was built for."""
    # bind the result to its own name instead of shadowing the `master`
    # fixture argument, as the original did
    data = project_data_from_master(master, 1, 2019)
    assert data["Chutney Bridge.xlsm"]["Project/Programme Name"] == "Chutney Bridge Ltd"
    assert data.quarter.quarter == 1
    assert data.quarter.end_date == datetime.date(2019, 6, 30)
diff --git a/datamaps/tests/test_cleanser.py b/datamaps/tests/test_cleanser.py
new file mode 100644
index 0000000..9d6df1a
--- /dev/null
+++ b/datamaps/tests/test_cleanser.py
@@ -0,0 +1,41 @@
+import datetime
+from ..process.cleansers import Cleanser
+
+
def test_cleaning_dot_date():
    """Dot-separated dates parse in one- and two-digit day/month and
    two- and four-digit year forms."""
    for raw in ("25.1.72", "25.01.72", "25.01.1972"):
        assert Cleanser(raw).clean() == datetime.date(1972, 1, 25)
+
+
def test_cleaning_slash_date():
    """Slash-separated dates parse in one- and two-digit day/month and
    two- and four-digit year forms."""
    for raw in ("25/1/72", "25/01/72", "25/01/1972"):
        assert Cleanser(raw).clean() == datetime.date(1972, 1, 25)
+
+
def test_em_dash_key():
    """An em dash in the string is normalised to a plain hyphen."""
    cleaned = Cleanser('Pre 14-15 BL – Income both Revenue and Capital').clean()
    assert cleaned == 'Pre 14-15 BL - Income both Revenue and Capital'
+
+
def test_double_trailing_space():
    """Both single and double trailing spaces are stripped."""
    expected = 'Pre 14-15 BL - Incoming both Revenue and Capital'
    for padding in ('  ', ' '):
        assert Cleanser(expected + padding).clean() == expected
diff --git a/datamaps/tests/test_quarter.py b/datamaps/tests/test_quarter.py
new file mode 100644
index 0000000..c5b1824
--- /dev/null
+++ b/datamaps/tests/test_quarter.py
@@ -0,0 +1,52 @@
+import datetime
+
+import pytest
+
+from ..core import Quarter
+
+
def test_initialisation():
    """Quarter start/end dates follow the UK financial year (Q4 spills
    into the next calendar year)."""
    expectations = [
        (1, datetime.date(2017, 4, 1), datetime.date(2017, 6, 30)),
        (2, datetime.date(2017, 7, 1), datetime.date(2017, 9, 30)),
        (4, datetime.date(2018, 1, 1), datetime.date(2018, 3, 31)),
    ]
    for quarter, start, end in expectations:
        q = Quarter(quarter, 2017)
        assert q.start_date == start
        assert q.end_date == end
+
+
def test_desc_string():
    """str() renders as "Qn YY/YY+1", including the 99/00 century
    rollover."""
    for year, label in ((2013, "13/14"), (1998, "98/99"), (1999, "99/00")):
        for quarter in range(1, 5):
            assert str(Quarter(quarter, year)) == f"Q{quarter} {label}"
+
+
def test_errors():
    """Out-of-range or wrongly-typed quarter/year arguments raise
    ValueError with a helpful message."""
    bad_args = [
        ((5, 2017), "A quarter must be either 1, 2, 3 or 4"),
        ((3, 1921), "Year must be between 1950 and 2100 - surely that will do?"),
        (("3", 2016), "A quarter must be either 1, 2, 3 or 4"),
        ((3, "1921"), "Year must be between 1950 and 2100 - surely that will do?"),
    ]
    for args, message in bad_args:
        with pytest.raises(ValueError) as excinfo:
            Quarter(*args)
        assert message in str(excinfo.value)
diff --git a/datamaps/utils.py b/datamaps/utils.py
new file mode 100644
index 0000000..950369e
--- /dev/null
+++ b/datamaps/utils.py
@@ -0,0 +1,564 @@
+import configparser
+import csv
+import fnmatch
+import logging
+import os
+import sys
+from collections import OrderedDict
+from datetime import date, datetime
+from math import isclose
+
+from openpyxl import Workbook, load_workbook
+from openpyxl.styles import PatternFill
+from openpyxl.utils import quote_sheetname
+
+from .process.cleansers import Cleanser
+
logger = logging.getLogger("bcompiler.utils")

# module-level placeholder; empty string by default
rdel_cdel_merge = ""

# Working-directory layout: everything lives under ~/Documents/bcompiler,
# with input files and config.ini in its "source" subdirectory.
DOCS = os.path.join(os.path.expanduser("~"), "Documents")
BCOMPILER_WORKING_D = "bcompiler"
ROOT_PATH = os.path.join(DOCS, BCOMPILER_WORKING_D)
SOURCE_DIR = os.path.join(ROOT_PATH, "source")

CONFIG_FILE = os.path.join(SOURCE_DIR, "config.ini")

# Load the runtime configuration; ConfigParser.read() silently ignores a
# missing file.
runtime_config = configparser.ConfigParser()
runtime_config.read(CONFIG_FILE)

# NOTE(review): if config.ini is absent this mapping-style lookup raises
# KeyError *before* the NoSectionError handler below can produce the
# friendly exit message — TODO confirm intended.
CURRENT_QUARTER = runtime_config["QuarterData"]["CurrentQuarter"]

try:
    # all sheet names declared in the [TemplateSheets] section
    SHEETS = [
        i for i in dict((runtime_config.items("TemplateSheets"))).values()
    ]
    BLANK_TEMPLATE_FN = runtime_config["BlankTemplate"]["name"]
except configparser.NoSectionError:
    print(
        "There is no config file present. Please run bcompiler-init to initialise bcompiler"
    )
    sys.exit()
+
+
def directory_has_returns_check(dir: str) -> bool:
    """Return True if *dir* contains at least one entry.

    Logs a critical message and returns False when the directory is empty.
    (The parameter name shadows the ``dir`` builtin, but is kept for
    backward compatibility with keyword callers.)
    """
    # empty-list comparison replaced with idiomatic truthiness test
    if not os.listdir(dir):
        logger.critical(
            "Please copy populated return files to returns directory.")
        return False
    return True
+
+
def row_check(excel_file: str):
    """Return one dict per sheet in *excel_file*, each recording the
    workbook file name, the sheet name and that sheet's row count."""
    workbook_name = excel_file.split("/")[-1]
    wb = load_workbook(excel_file)
    return [
        dict(
            workbook=workbook_name,
            sheet=sheet,
            row_count=len(list(wb[sheet].rows)),
        )
        for sheet in wb.sheetnames
    ]
+
+
def row_data_formatter(csv_output=False, quiet=False) -> None:
    """
    Prints counts of rows in each sheet in each return spreadsheet,
    comparing each return against the blank template's counts.
    :param: csv_output - provide True to write output to csv file in output
    directory.
    :param: quiet - output differing row counts only. Cannot be used with
    csv_output argument.
    """
    if csv_output and quiet:
        logger.critical("Cannot use --csv and --quiet option. Choose one"
                        " or the other.")
        return
    # NOTE(review): os.path.join never raises FileNotFoundError, so this
    # handler is dead code; if it ever fired, returns_dir would be left
    # unbound and the os.listdir() call below would raise NameError.
    try:
        returns_dir = os.path.join(ROOT_PATH, "source", "returns")
    except FileNotFoundError:
        logger.warning("There is no output directory. Run bcompiler -d to "
                       "set up working directories")
    # NOTE(review): if the template is missing, tmpl_data stays unbound and
    # the loop below raises NameError rather than exiting cleanly — TODO
    # confirm intended.
    try:
        tmpl_data = row_check(
            os.path.join(ROOT_PATH, "source", BLANK_TEMPLATE_FN))
    except FileNotFoundError:
        logger.warning("bicc_template.xlsm not found")
    if csv_output:
        csv_output_path = os.path.join(OUTPUT_DIR, "row_count.csv")
        csv_output_file = open(csv_output_path, "w", newline="")
        csv_writer = csv.writer(csv_output_file)
        logger.info("Writing output to csv file...")
    elif quiet:
        logger.info("Looking for anomolies in row counts in each sheet...")
    else:
        # table header for console output
        print("{0:<90}{1:<40}{2:<10}".format("Workbook", "Sheet", "Row Count"))
        print("{:#<150}".format(""))

    # Start with the bicc_template.xlsm BASE data
    for line in tmpl_data:
        if csv_output:
            csv_writer.writerow(
                [line["workbook"], line["sheet"], line["row_count"]])
        elif quiet:
            pass
        else:
            print(
                f"{line['workbook']:<90}{line['sheet']:<40}{line['row_count']:<10}"
            )
    print("{:#<150}".format(""))
    # Now compare each .xlsm return, sheet by sheet, against the template
    for f in os.listdir(returns_dir):
        if fnmatch.fnmatch(f, "*.xlsm"):
            d = row_check(os.path.join(returns_dir, f))
            zipped_data = zip(tmpl_data, d)
            for line in zipped_data:
                # line is (template_sheet_dict, return_sheet_dict); flag is
                # True when their row counts differ
                counts = [i["row_count"] for i in line]
                flag = counts[0] != counts[-1]
                if not flag:
                    if csv_output:
                        csv_writer.writerow([
                            line[1]["workbook"],
                            line[1]["sheet"],
                            line[1]["row_count"],
                        ])
                    elif quiet:
                        pass
                    else:
                        print(
                            f"{line[1]['workbook']:<90}{line[1]['sheet']:<40}{line[1]['row_count']:<10}"
                        )
                else:
                    # inconsistent counts are always reported, even in
                    # quiet mode
                    if csv_output:
                        csv_writer.writerow([
                            line[1]["workbook"],
                            line[1]["sheet"],
                            line[1]["row_count"],
                            "INCONSISTENT WITH bicc_template.xlsm",
                        ])
                    else:
                        print(
                            f"{line[1]['workbook']:<90}{line[1]['sheet']:<40}{line[1]['row_count']:<10} *"
                        )
            if not quiet:
                print("{:#<150}".format(""))
            else:
                print(".")
        else:
            logger.critical(f"{f} does not have .xlsm file extension.")
    if csv_output:
        print(f"csv output file available at {csv_output_path}")
        csv_output_file.close()
+
+
def quick_typechecker(*args) -> bool:
    """
    Return True when every argument is an int, float or datetime.date
    (the original docstring omitted date); any other type returns False.
    Vacuously True for no arguments.
    """
    return all(isinstance(arg, (int, float, date)) for arg in args)
+
+
def simple_round(fl, prec):
    """Round the number *fl* to *prec* decimal places."""
    return round(fl, prec)
+
+
def bc_is_close(x, y):
    """Return True when *x* and *y* are within 0.1% of each other.

    Dates and datetimes are never considered close to anything.
    """
    # guard clause instead of if/else
    if isinstance(x, (date, datetime)) or isinstance(y, (date, datetime)):
        return False
    return isclose(x, y, rel_tol=0.001)
+
+
def cell_bg_colour(rgb=(0, 0, 0)):
    """
    Return a solid openpyxl PatternFill for the given colour.

    :param rgb: sequence of three ints in 0-255, e.g. (255, 0, 0) for red.
        Defaults to black.
    """
    # tuple default replaces the original mutable-list default ([]),
    # which is a shared-state hazard and would raise IndexError if the
    # function were ever called with no argument
    c_value = "{0:02X}{1:02X}{2:02X}".format(*rgb)
    return PatternFill(patternType="solid", fgColor=c_value, bgColor=c_value)
+
+
def _name_pair(full_name, label):
    """Split *full_name* on a space into dict(first_name=..., last_name=...);
    log a warning and return None when it cannot be split into two parts."""
    parts = full_name.split(" ")
    if len(parts) > 1:
        return dict(first_name=parts[0], last_name=parts[1])
    logger.warning(
        "{0} ({1}) is not suitable for splitting".format(label, full_name))
    return None


def get_relevant_names(project_name, project_data):
    """
    Return a (sro, pd) tuple of name dicts for *project_name*, taken from
    the "SRO Full Name" and "PD Full Name" fields of project_data.

    Each element is dict(first_name=..., last_name=...) or None when the
    corresponding full name cannot be split into two words.
    """
    # the original repeated four near-identical try/except IndexError
    # blocks and relied on UnboundLocalError to detect failure; the helper
    # collapses that into one code path per name
    sro_d = _name_pair(project_data[project_name]["SRO Full Name"],
                       "SRO Full Name")
    pd_d = _name_pair(project_data[project_name]["PD Full Name"],
                      "PD Full Name")
    return (sro_d, pd_d)
+
+
def project_data_from_master(master_file: str, opened_wb=False):
    """
    Return a dict of project data keyed by project name (row 1 of each
    column) from a master spreadsheet; each value maps the column-A keys to
    that project's cell values.

    :param master_file: path to the master xlsx, or an already-opened
        openpyxl workbook when opened_wb is truthy.
    :param opened_wb: pass True when master_file is an opened workbook.
    """
    if opened_wb is False:
        wb = load_workbook(master_file)
    else:
        wb = master_file
    ws = wb.active
    # cleanse the keys in column A; None cells are left untouched
    for cell in ws["A"]:
        if cell.value is None:
            continue
        c = Cleanser(cell.value)
        cell.value = c.clean()
    p_dict = {}
    for col in ws.iter_cols(min_col=2):
        project_name = ""
        o = OrderedDict()
        for cell in col:
            if cell.row == 1:
                project_name = cell.value
                p_dict[project_name] = o
            else:
                key = ws.cell(row=cell.row, column=1).value
                # isinstance replaces the original `type(...) == datetime`
                # check, so datetime subclasses are converted too;
                # datetimes are flattened to plain dates
                if isinstance(cell.value, datetime):
                    p_dict[project_name][key] = date(
                        cell.value.year, cell.value.month, cell.value.day)
                else:
                    p_dict[project_name][key] = cell.value
    # drop the "None" project produced by empty header cells
    # (pop with a default replaces the original try/del/except KeyError)
    p_dict.pop(None, None)
    return p_dict
+
+
def project_data_line():
    """
    Read SOURCE_DIR/master_transposed.csv and return a dict of row data
    keyed by the "Project/Programme Name" column.

    Later rows silently overwrite earlier rows with the same name.
    """
    p_dict = {}
    with open(SOURCE_DIR + "master_transposed.csv", "r") as f:
        for row in csv.DictReader(f):
            key = row.pop("Project/Programme Name")
            # the original tested `key in p_dict` and then did nothing with
            # the answer; that dead branch is removed
            p_dict[key] = row
            logger.debug(
                "Adding {} to project_data_line dictionary".format(key))
    return p_dict
+
+
def open_openpyxl_template(template_file):
    """
    Open *template_file* (keeping any VBA content intact) and return the
    openpyxl workbook object.
    """
    workbook = load_workbook(template_file, keep_vba=True)
    logger.info("Opening {} as an openpyxl object".format(template_file))
    return workbook
+
+
def working_directory(dir_type=None):
    """
    Return the path to one of the bcompiler working directories under
    ~/Documents/bcompiler.

    :param dir_type: one of "source", "output" or "returns"; anything else
        (including the default None) returns None.
    :return: path string with a trailing slash, or None.
    """
    # NOTE(review): the original wrapped os.path.join in a
    # try/except FileNotFoundError, but os.path.join never raises it;
    # the dead handler has been removed.
    root_path = os.path.join(
        os.path.expanduser("~"), "Documents", "bcompiler")
    suffixes = {
        "source": "/source/",
        "output": "/output/",
        "returns": "/source/returns/",
    }
    suffix = suffixes.get(dir_type)
    if suffix is None:
        return None
    return root_path + suffix
+
+
# TODO this lot needs cleaning up - no more use of working_directory()

# NOTE(review): SOURCE_DIR is rebound here — it was first defined near the
# top of the module via os.path.join (no trailing slash); this version
# carries a trailing slash, which the concatenations below rely on.
SOURCE_DIR = working_directory("source")
OUTPUT_DIR = working_directory("output")
RETURNS_DIR = working_directory("returns")
# Paths to the datamap/master artefacts used throughout the module.
DATAMAP_RETURN_TO_MASTER = SOURCE_DIR + "datamap.csv"
DATAMAP_MASTER_TO_RETURN = SOURCE_DIR + "datamap.csv"
DATAMAP_MASTER_TO_GMPP = SOURCE_DIR + "archive/datamap-master-to-gmpp"
CLEANED_DATAMAP = SOURCE_DIR + "cleaned_datamap.csv"
MASTER = SOURCE_DIR + "master.csv"
TEMPLATE = SOURCE_DIR + BLANK_TEMPLATE_FN
GMPP_TEMPLATE = SOURCE_DIR + "archive/gmpp_template.xlsx"
+
+
def index_returns_directory():
    """
    Prior to compiling a master, it is useful to get the order of projects
    by their file name, as the compile.run() function traverses the directory
    top to bottom to build the master. We can then use this to compare with
    the order of projects (columns) in the old master document we are
    comparing the current compile against.

    :return: list of project names read from cell B5 of the summary sheet
        of each .xlsm file in RETURNS_DIR.
    """
    # the original first copied os.listdir() into a temporary list and then
    # looped over the copy; iterate the directory listing directly
    pnames_in_returns_dir = []
    for f in os.listdir(RETURNS_DIR):
        if fnmatch.fnmatch(f, "*.xlsm"):
            wb = load_workbook(os.path.join(RETURNS_DIR, f))
            ws = wb[runtime_config["TemplateSheets"]["summary_sheet"]]
            pnames_in_returns_dir.append(ws["B5"].value)
    return pnames_in_returns_dir
+
+
def splat_rows(row):
    """Yield a single list of (value, row, column) triples, one per cell
    in *row*."""
    yield [(cell.value, cell.row, cell.column) for cell in row]
+
+
def parse_csv_to_file(source_file):
    """
    Transpose *source_file* (the master csv) into
    SOURCE_DIR/master_transposed.csv.

    Logs critically and returns without writing anything when source_file
    does not exist.
    :param source_file: path to the master csv
    :return: None
    """
    # The original opened the source file once (leaking the handle), then
    # reopened it; worse, its FileNotFoundError handler called
    # source.close() on a name that was never bound, raising NameError.
    # It also created/truncated the output file before checking the source.
    try:
        with open(source_file, "r") as source_f:
            lis = [x.split(",") for x in source_f]
    except FileNotFoundError:
        logger.critical(f"There is no file {source_file} present.")
        return
    for i in lis:
        # remove trailing "\n" from the end of each original master.csv line
        i[-1] = i[-1].rstrip()
    with open(SOURCE_DIR + "master_transposed.csv", "w+") as output:
        for x in zip(*lis):
            for y in x:
                output.write(y + ",")
            output.write("\n")
+
+
def create_master_dict_transposed(source_master_csv):
    """
    Ensure a 'master_transposed.csv' file is present in SOURCE_DIR (the
    side-effect of parse_csv_to_file) and return its rows as a list of
    dicts — all the data from the master.
    """
    parse_csv_to_file(source_master_csv)
    with open(SOURCE_DIR + "master_transposed.csv", "r") as f:
        return list(csv.DictReader(f))
+
+
# Name of the sheet in the template workbook that holds the dropdown lists.
sheet_name = "Dropdown"

# Maps each dropdown/validation name to the absolute cell range on the
# Dropdown sheet that holds its permitted values.
# NOTE(review): the "SDP" entry uses "$AR2" (missing the row anchor on the
# start cell) — looks like a typo for "$AR$2"; confirm against the template.
VALIDATION_REFERENCES = {
    "Quarter":
    "{0}!$A$2:$A$9".format(quote_sheetname(sheet_name)),
    "Joining Qtr":
    "{0}!$B$2:$B$25".format(quote_sheetname(sheet_name)),
    "Classification":
    "{0}!$C$2:$C$4".format(quote_sheetname(sheet_name)),
    "Entity format":
    "{0}!$D$2:$D$4".format(quote_sheetname(sheet_name)),
    "Methodology":
    "{0}!$E$2:$E$10".format(quote_sheetname(sheet_name)),
    "Category":
    "{0}!$F$2:$H$11".format(quote_sheetname(sheet_name)),
    "Scope Changed":
    "{0}!$G$2:$I$4".format(quote_sheetname(sheet_name)),
    "Monetised / Non Monetised Benefits":
    "{0}!$H$2:$H$4".format(quote_sheetname(sheet_name)),
    "RAG":
    "{0}!$I$2:$I$6".format(quote_sheetname(sheet_name)),
    "RAG 2":
    "{0}!$J$2:$J$4".format(quote_sheetname(sheet_name)),
    "RPA level":
    "{0}!$K$2:$K$4".format(quote_sheetname(sheet_name)),
    "Capability RAG":
    "{0}!$L$2:$L$5".format(quote_sheetname(sheet_name)),
    "MPLA / PLP":
    "{0}!$M$2:$M$30".format(quote_sheetname(sheet_name)),
    "PL Changes":
    "{0}!$N$2:$N$31".format(quote_sheetname(sheet_name)),
    "Stage":
    "{0}!$O$2:$O$10".format(quote_sheetname(sheet_name)),
    "Business Cases":
    "{0}!$P$2:$P$11".format(quote_sheetname(sheet_name)),
    "Milestone Types":
    "{0}!$Q$2:$Q$4".format(quote_sheetname(sheet_name)),
    "Finance figures format":
    "{0}!$R$2:$R$3".format(quote_sheetname(sheet_name)),
    "Index Years":
    "{0}!$S$2:$S$27".format(quote_sheetname(sheet_name)),
    "Discount Rate":
    "{0}!$T$2:$T$32".format(quote_sheetname(sheet_name)),
    "Finance type":
    "{0}!$U$2:$U$6".format(quote_sheetname(sheet_name)),
    "Yes/No":
    "{0}!$V$2:$V$3".format(quote_sheetname(sheet_name)),
    "Years (Spend)":
    "{0}!$W$2:$W$90".format(quote_sheetname(sheet_name)),
    "Years (Benefits)":
    "{0}!$X$2:$X$90".format(quote_sheetname(sheet_name)),
    "Snapshot Dates":
    "{0}!$Y$2:$Y$9".format(quote_sheetname(sheet_name)),
    "Percentage of time spent on SRO role":
    "{0}!$Z$2:$Z$21".format(quote_sheetname(sheet_name)),
    "AR Category":
    "{0}!$AA$2:$AA$5".format(quote_sheetname(sheet_name)),
    "Project Lifecycle":
    "{0}!$AB$2:$AB$6".format(quote_sheetname(sheet_name)),
    "Programme Lifecycle":
    "{0}!$AC$2:$AC$7".format(quote_sheetname(sheet_name)),
    "Other":
    "{0}!$AD$2:$AD$19".format(quote_sheetname(sheet_name)),
    "Start / Year end - FY":
    "{0}!$AE$3:$AE$22".format(quote_sheetname(sheet_name)),
    "Count":
    "{0}!$AF$2:$AF$22".format(quote_sheetname(sheet_name)),
    "VFM":
    "{0}!$AG$2:$AG$11".format(quote_sheetname(sheet_name)),
    "DfT Group":
    "{0}!$AH$2:$AH$7".format(quote_sheetname(sheet_name)),
    "DfT Division":
    "{0}!$AI$2:$AI$15".format(quote_sheetname(sheet_name)),
    "Agency":
    "{0}!$AJ$2:$AJ$9".format(quote_sheetname(sheet_name)),
    "High Speed Rail":
    "{0}!$AK$2:$AK$4".format(quote_sheetname(sheet_name)),
    "Rail Group":
    "{0}!$AL$2:$AL$4".format(quote_sheetname(sheet_name)),
    "Roads, Devolution & Motoring":
    "{0}!$AM$2:$AM$5".format(quote_sheetname(sheet_name)),
    "International, Security and Environment":
    "{0}!$AN$2:$AN$4".format(quote_sheetname(sheet_name)),
    "Resource and Strategy":
    "{0}!$AO$2:$AO$2".format(quote_sheetname(sheet_name)),
    "Non-Group":
    "{0}!$AP$2:$AP$2".format(quote_sheetname(sheet_name)),
    "GMPP Annual Report Category":
    "{0}!$AQ$2:$AQ$2".format(quote_sheetname(sheet_name)),
    "SDP":
    "{0}!$AR2:$AR$5".format(quote_sheetname(sheet_name)),
}
+
+
def row_accessor(row: tuple):
    """
    Utility generator yielding a (str, str) pair per cell; e.g.
    ('A10', 'Project/Programme Name').
    :param row: iterable of cell objects with .column, .row and .value
    :return:
    """
    for cell in row:
        yield ("".join([cell.column, str(cell.row)]), cell.value)
+
+
def gen_sheet_data(workbook: str) -> dict:
    """
    Return a dict mapping each sheet title in the given xlsx file to that
    sheet's rows, where each row is a list of ('A1'-style ref, value) pairs.
    :param workbook: path to the xlsx file
    :return: dict of data by sheet in workbook
    """
    wb = load_workbook(workbook)
    # wb.worksheets is the public API; the original reached into the
    # private wb._sheets attribute (NOTE(review): _sheets also includes
    # chartsheets — assumed absent from these templates; confirm)
    data = {}
    for sheet in wb.worksheets:
        data[sheet.title] = [list(row_accessor(row)) for row in sheet.rows]
    return data
+
+
def parse_data_row(row: list):
    """
    Utility generator which yields each two-item tuple in *row* as a
    (ref, value) pair, e.g. ('A10', 'Project/Programme Name').
    :param row: list of two-item tuples
    """
    # NOTE: the original annotated the return as `tuple`, but this is a
    # generator function; the misleading annotation is removed
    for pair in row:
        yield pair[0], pair[1]
+
+
def get_sheets_in_workbook(real_template: str) -> list:
    """
    Utility function returning the worksheet objects from an xlsx file.

    NOTE: the original docstring claimed "a list of sheet names", but the
    function returns sheet objects (callers read .title from them).
    :param real_template: path to the xlsx file
    :return: list of openpyxl worksheet objects
    """
    wb = load_workbook(real_template)
    # public API instead of the private wb._sheets attribute
    return wb.worksheets
+
+
def generate_test_template_from_real(real_template: str,
                                     save_path: str) -> None:
    """
    Given the bicc_template.xlsm file, this function strips it of
    everything but cell data.
    :param real_template: str path of location of bicc_template.xlsm
    :param save_path: str path of output directory; file will be named 'gen_bicc_template.xlsm',
    of the form "~/Documents"
    :return:
    """
    data = gen_sheet_data(real_template)
    sheets = get_sheets_in_workbook(real_template)
    blank = Workbook()
    sheet_order = 0
    for sheet in sheets:
        # recreate each sheet at the same position in a fresh workbook
        summary_sheet = blank.create_sheet(sheet.title, sheet_order)
        for row in data[sheet.title]:
            r = parse_data_row(row)
            for cell in r:
                # cell is an ('A1'-style ref, value) pair
                summary_sheet[cell[0]] = cell[1]
        sheet_order += 1
    # normalise the save path before appending the fixed file name
    if save_path.endswith("/"):
        save_path = save_path[:-1]
    blank.save("".join([save_path, "/gen_bicc_template.xlsm"]))
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e4e8b5e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+import os
+
+from setuptools import find_packages, setup
+
+import datamaps
+
+
def read(*names):
    """Return a dict mapping each name in *names* to the contents of the
    first existing file among '<name>.txt' / '<name>.rst' (relative to the
    current directory), or '' when neither exists."""
    values = dict()
    extensions = ['.txt', '.rst']
    for name in names:
        value = ''
        for extension in extensions:
            filename = name + extension
            if os.path.isfile(filename):
                # context manager closes the handle; the original
                # open(...).read() leaked it
                with open(filename) as fh:
                    value = fh.read()
                break
        values[name] = value
    return values
+
+
# Assemble the PyPI long description from README(.txt/.rst) and
# CHANGES(.txt/.rst); read() substitutes '' for any missing file.
long_description = """
%(README)s

News
====

%(CHANGES)s

""" % read('README', 'CHANGES')

setup(
    name='datamaps',
    # version is sourced from datamaps.__version__ (single source of truth)
    version=datamaps.__version__,
    description='Collect and clean data using Excel spreadsheets.',
    long_description=long_description,
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Environment :: Console",
        "Intended Audience :: Other Audience",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Topic :: Text Processing",
    ],
    keywords='data console commandline excel',
    author='Matthew Lemon',
    author_email='matt@matthewlemon.com',
    maintainer='Matthew Lemon',
    maintainer_email='matt@matthewlemon.com',
    url='https://github.com/hammerheadlemon/datamaps',
    packages=find_packages(),
    python_requires='>=3.6',
    # console entry point: `datamaps` runs datamaps.main:cli
    entry_points={'console_scripts': [
        'datamaps = datamaps.main:cli'
    ]},
    setup_requires=['wheel'],
    install_requires=[
        'click',
        'colorlog',
        'python-dateutil',
        # engine pulled straight from GitHub master rather than PyPI
        'bcompiler-engine @ https://github.com/hammerheadlemon/bcompiler-engine/archive/master.zip#egg=bcompiler-engine'
    ],
    test_suite='datamaps.tests')