aboutsummaryrefslogtreecommitdiffstats
path: root/datamaps/core/master.py
diff options
context:
space:
mode:
Diffstat (limited to 'datamaps/core/master.py')
-rw-r--r--datamaps/core/master.py208
1 files changed, 208 insertions, 0 deletions
diff --git a/datamaps/core/master.py b/datamaps/core/master.py
new file mode 100644
index 0000000..1fbfe90
--- /dev/null
+++ b/datamaps/core/master.py
@@ -0,0 +1,208 @@
+import re
+import datetime
+import logging
+import unicodedata
+from pathlib import Path
+from typing import List, Tuple, Iterable, Optional, Any
+
+from ..utils import project_data_from_master
+from ..process.cleansers import DATE_REGEX_4
+from .temporal import Quarter
+
+from openpyxl import load_workbook
+
+logger = logging.getLogger('bcompiler.utils')
+
+
+class ProjectData:
+ """
+ ProjectData class
+ """
+ def __init__(self, d: dict) -> None:
+ """
+ :py:func:`OrderedDict` is easiest to get from project_data_from_master[x]
+ """
+ self._data = d
+
+ def __len__(self) -> int:
+ return len(self._data)
+
+ def __getitem__(self, item):
+ return self._data[item]
+
+ def key_filter(self, key: str) -> List[Tuple]:
+ """
+ Return a list of (k, v) tuples if k in master key.
+ """
+ data = [item for item in self._data.items() if key in item[0]]
+ if not data:
+ raise KeyError("Sorry, there is no matching data")
+ return (data)
+
+ def pull_keys(self, input_iter: Iterable, flat=False) -> List[Tuple[Any, ...]]:
+ """
+ Returns a list of (key, value) tuples from ProjectData if key matches a
+ key. The order of tuples is based on the order of keys passed in the iterable.
+ """
+ if flat is True:
+ # search and replace troublesome EN DASH character
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0].strip().replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')) == i]
+ xs = [_convert_str_date_to_object(x) for x in xs]
+ ts = sorted(xs, key=lambda x: input_iter.index(x[0].strip().replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS'))))
+ ts = [item[1] for item in ts]
+ return ts
+ else:
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0].replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS')) == i]
+ xs = [item for item in self._data.items()
+ for i in input_iter if item[0] == i]
+ xs = [_convert_str_date_to_object(x) for x in xs]
+ ts = sorted(xs, key=lambda x: input_iter.index(x[0].replace(unicodedata.lookup('EN DASH'), unicodedata.lookup('HYPHEN-MINUS'))))
+ return ts
+
+ def __repr__(self):
+ return f"ProjectData() - with data: {id(self._data)}"
+
+
+def _convert_str_date_to_object(d_str: tuple) -> Tuple[str, Optional[datetime.date]]:
+ try:
+ if re.match(DATE_REGEX_4, d_str[1]):
+ try:
+ ds = d_str[1].split('-')
+ return (d_str[0], datetime.date(int(ds[0]), int(ds[1]), int(ds[2])))
+ except TypeError:
+ return d_str
+ else:
+ return d_str
+ except TypeError:
+ return d_str
+
+
+class Master:
+ """A Master object, representing the main central data item in ``bcompiler``.
+
+ Args:
+ quarter (:py:class:`bcompiler.api.Quarter`): creating using ``Quarter(1, 2017)`` for example.
+ path (str): path to the master xlsx file
+
+ A master object is a composition between a :py:class:`bcompiler.api.Quarter` object and an
+ actual master xlsx file on disk.
+
+ You create one, either by creating the Quarter object first, and using that as the first
+ parameter of the ``Master`` constructor, e.g.::
+
+ from bcompiler.api import Quarter
+ from bcompiler.api import Master
+
+ q1 = Quarter(1, 2016)
+ m1 = Master(q1, '/tmp/master_1_2016.xlsx')
+
+ or by doing both in one::
+
+ m1 = Master(Quarter(1, 2016), '/tmp/master_1_2016.xlsx')
+
+ Once you have a ``Master`` object, you can access project data from it, like this::
+
+ project_data = m1['Project Title']
+
+
+ The following *attributes* are available on `m1` once created as such, e.g.::
+
+ data = m1.data
+ quarter = m1.quarter
+ filename = m1.filename
+ ..etc
+ """
+ def __init__(self, quarter: Quarter, path: str) -> None:
+ self._quarter = quarter
+ self.path = path
+ self._data = project_data_from_master(self.path)
+ self._project_titles = [item for item in self.data.keys()]
+ self.year = self._quarter.year
+
+ def __getitem__(self, project_name):
+ return ProjectData(self._data[project_name])
+
+ @property
+ def data(self):
+ """Return all the data contained in the master in a large, nested dictionary.
+
+ The resulting data structure contains a dictionary of :py:class:`colletions.OrderedDict` items whose
+ key is the name of a project::
+
+ "Project Name": OrderedDict("key": "value"
+ ...)
+
+ This object can then be further interrogated, for example to obtain all key/values
+ from a partictular project, by doing::
+
+ d = Master.data
+ project_data = d['PROJECT_NAME']
+
+ """
+ return self._data
+
+ @property
+ def quarter(self):
+ """Returns the ``Quarter`` object associated with the ``Master``.
+
+ Example::
+
+ q1 = m.quarter
+
+ ``q1`` can then be further interrogated as documented in :py:class:`core.temporal.Quarter`.
+
+ """
+
+ return self._quarter
+
+ @property
+ def filename(self):
+ """The filename of the master xlsx file, e.g. ``master_1_2017.xlsx``.
+ """
+ p = Path(self.path)
+ return p.name
+
+ @property
+ def projects(self):
+ """A list of project titles derived from the master xlsx.
+ """
+ return self._project_titles
+
+ def duplicate_keys(self, to_log=None):
+ """Checks for duplicate keys in a master xlsx file.
+
+ Args:
+ to_log (bool): Optional True or False, depending on whether you want to see duplicates reported in a ``WARNING`` log message. This is used mainly for internal purposes within ``bcompiler``.
+
+ Returns:
+ duplicates (set): a set of duplicated keys
+ """
+ wb = load_workbook(self.path)
+ ws = wb.active
+ col_a = next(ws.iter_cols())
+ col_a = [item.value for item in col_a]
+ seen: set = set()
+ uniq = []
+ dups: set = set()
+ for x in col_a:
+ if x not in seen:
+ uniq.append(x)
+ seen.add(x)
+ else:
+ dups.add(x)
+ if to_log and len(dups) > 0:
+ for x in dups:
+ logger.warning(f"{self.path} contains duplicate key: \"{x}\". Masters cannot contain duplicate keys. Rename them.")
+ return True
+ elif to_log and len(dups) == 0:
+ logger.info(f"No duplicate keys in {self.path}")
+ return False
+ elif len(dups) > 0:
+ return dups
+ else:
+ return False
+
+ def __repr__(self):
+ return f"Master({self.path}, {self.quarter.quarter}, {self.quarter.year})"