import datetime
import logging
import re
import unicodedata
from pathlib import Path
from typing import Any, Iterable, List, Optional, Tuple

from openpyxl import load_workbook

from datamaps.plugins.dft.portfolio import project_data_from_master
from datamaps.process.cleansers import DATE_REGEX_4
from datamaps.core.temporal import Quarter

logger = logging.getLogger("bcompiler.utils")

# ``ProjectData.pull_keys`` treats an EN DASH inside a stored key as a plain
# hyphen when matching; the characters are looked up once here rather than on
# every comparison.
_EN_DASH = unicodedata.lookup("EN DASH")
_HYPHEN_MINUS = unicodedata.lookup("HYPHEN-MINUS")


class ProjectData:
    """Thin wrapper around the key/value mapping of a single project.

    Instances are what ``Master.__getitem__`` returns. The wrapped mapping is
    stored as-is (not copied) and exposed through ``len()``, item access and
    the two query helpers below.
    """

    def __init__(self, d: dict) -> None:
        """Wrap ``d``.

        Args:
            d: mapping of master key -> value for one project; the easiest way
                to obtain one is ``project_data_from_master(path)[project]``.
        """
        self._data = d

    def __len__(self) -> int:
        return len(self._data)

    def __getitem__(self, item):
        return self._data[item]

    def key_filter(self, key: str) -> List[Tuple]:
        """Return every ``(k, v)`` pair whose key contains ``key``.

        Args:
            key: text to look for inside each master key (substring match).

        Returns:
            The matching ``(key, value)`` tuples, in mapping order.

        Raises:
            KeyError: if no key contains ``key``.
        """
        data = [item for item in self._data.items() if key in item[0]]
        if not data:
            raise KeyError("Sorry, there is no matching data")
        return data

    def pull_keys(self, input_iter: Iterable, flat=False) -> List[Tuple[Any, ...]]:
        """Return the data for the requested keys, ordered as requested.

        A stored key matches a requested key after every EN DASH in the stored
        key has been replaced by a hyphen-minus; with ``flat=True`` surrounding
        whitespace is stripped from the stored key as well. Requested keys are
        compared as given. String values that look like dates are converted
        with ``_convert_str_date_to_object``.

        Args:
            input_iter: the keys to pull. Any iterable is accepted; it is
                materialised once so that positions can be looked up.
            flat: when exactly ``True``, return only the values instead of
                ``(key, value)`` tuples.

        Returns:
            A list ordered by the position of each key's first occurrence in
            ``input_iter``. An item appears once per occurrence of its key in
            ``input_iter``. Keys that are not found are silently skipped.
        """
        wanted = list(input_iter)
        strip_keys = flat is True

        def _normalise(key):
            if strip_keys:
                key = key.strip()
            return key.replace(_EN_DASH, _HYPHEN_MINUS)

        # Matching on the same normalised form that is used for ordering keeps
        # the two steps in agreement. Previously the non-flat path matched on
        # the raw key but ordered on the normalised one, which raised
        # ValueError for any matched key that contained an EN DASH.
        positioned = []
        for item in self._data.items():
            normalised = _normalise(item[0])
            hits = wanted.count(normalised)
            if hits:
                entry = (wanted.index(normalised), _convert_str_date_to_object(item))
                positioned.extend([entry] * hits)

        # list.sort is stable, so items sharing a position keep mapping order.
        positioned.sort(key=lambda entry: entry[0])
        ordered = [item for _, item in positioned]

        if strip_keys:
            return [item[1] for item in ordered]
        return ordered

    def __repr__(self):
        return f"ProjectData() - with data: {id(self._data)}"


def _convert_str_date_to_object(d_str: tuple) -> Tuple[str, Any]:
    """Turn a ``(key, "YYYY-MM-DD")`` pair into ``(key, datetime.date)``.

    Args:
        d_str: a ``(key, value)`` tuple.

    Returns:
        A new tuple holding a ``datetime.date`` when the value is a string that
        matches ``DATE_REGEX_4`` and whose first three ``-``-separated parts
        form a valid date; otherwise ``d_str`` itself, unchanged.
    """
    try:
        value = d_str[1]
        looks_like_date = re.match(DATE_REGEX_4, value)
    except TypeError:
        # Non-string values (numbers, dates, None, ...) are passed through.
        return d_str

    if not looks_like_date:
        return d_str

    parts = value.split("-")
    try:
        return (
            d_str[0],
            datetime.date(int(parts[0]), int(parts[1]), int(parts[2])),
        )
    except (TypeError, ValueError, IndexError):
        # int() and datetime.date() signal bad input with ValueError, and a
        # string with fewer than three "-"-separated parts raises IndexError;
        # such values are passed through untouched instead of crashing.
        return d_str


class Master:
    """A Master object, representing the main central data item in ``datamaps``.

    Args:
        quarter (:py:class:`datamaps.core.temporal.Quarter`): created using
            ``Quarter(1, 2017)`` for example.
        path (str): path to the master xlsx file
        declared_month (int, optional): month number (1-12) used by the
            ``month`` property.

    A master object is a composition between a
    :py:class:`datamaps.core.temporal.Quarter` object and an actual master
    xlsx file on disk.

    You create one, either by creating the Quarter object first, and using
    that as the first parameter of the ``Master`` constructor, e.g.::

        from datamaps.core.temporal import Quarter

        q1 = Quarter(1, 2016)
        m1 = Master(q1, '/tmp/master_1_2016.xlsx')

    or by doing both in one::

        m1 = Master(Quarter(1, 2016), '/tmp/master_1_2016.xlsx')

    Once you have a ``Master`` object, you can access project data from it,
    like this::

        project_data = m1['Project Title']

    The following *attributes* are available on `m1` once created as such,
    e.g.::

        data = m1.data
        quarter = m1.quarter
        filename = m1.filename

    ..etc
    """

    # English month names keyed by month number; used by the ``month`` property.
    _MONTH_NAMES = {
        1: "January",
        2: "February",
        3: "March",
        4: "April",
        5: "May",
        6: "June",
        7: "July",
        8: "August",
        9: "September",
        10: "October",
        11: "November",
        12: "December",
    }

    def __init__(
        self, quarter: Quarter, path: str, declared_month: Optional[int] = None
    ) -> None:
        self._quarter = quarter
        self._declared_month = declared_month
        self.path = path
        # The master file is read once, at construction time.
        self._data = project_data_from_master(self.path)
        self._project_titles = list(self._data.keys())
        self.year = self._quarter.year

    def __getitem__(self, project_name):
        """Return the data of ``project_name`` wrapped in a ``ProjectData``."""
        return ProjectData(self._data[project_name])

    @property
    def data(self):
        """Return all the data contained in the master in a large, nested dictionary.

        The resulting data structure contains a dictionary of
        :py:class:`collections.OrderedDict` items whose key is the name of a
        project::

            "Project Name": OrderedDict("key": "value" ...)

        This object can then be further interrogated, for example to obtain
        all key/values from a particular project, by doing::

            d = Master.data
            project_data = d['PROJECT_NAME']
        """
        return self._data

    @property
    def quarter(self):
        """Returns the ``Quarter`` object associated with the ``Master``.

        Example::

            q1 = m.quarter

        ``q1`` can then be further interrogated as documented in
        :py:class:`core.temporal.Quarter`.
        """
        return self._quarter

    @property
    def month(self):
        """Return the ``month`` value of the declared month within the quarter.

        The ``declared_month`` number given at construction is translated to
        its English name, and the ``.month`` attribute of the first entry of
        ``self.quarter.months`` that equals that name is returned.

        Raises:
            KeyError: if ``declared_month`` is not an integer from 1 to 12
                (this includes the default ``None``).
            IndexError: if no month of the quarter matches.
        """
        return [
            m.month
            for m in self.quarter.months
            if m.month == self._MONTH_NAMES[self._declared_month]
        ][0]

    @property
    def filename(self):
        """The filename of the master xlsx file, e.g. ``master_1_2017.xlsx``."""
        return Path(self.path).name

    @property
    def projects(self):
        """A list of project titles derived from the master xlsx."""
        return self._project_titles

    def duplicate_keys(self, to_log=None):
        """Checks for duplicate keys in a master xlsx file.

        The values of the first column of the workbook's active sheet are read
        and any value occurring more than once counts as a duplicate.

        Args:
            to_log (bool): Optional True or False, depending on whether you
                want to see duplicates reported in a ``WARNING`` log message.
                This is used mainly for internal purposes within
                ``bcompiler``.

        Returns:
            With ``to_log`` truthy: ``True`` if duplicates were found (each
            one is logged as a warning), otherwise ``False`` (an info message
            is logged). With ``to_log`` falsy: the set of duplicated keys, or
            ``False`` if there are none.
        """
        wb = load_workbook(self.path)
        ws = wb.active
        first_column = next(ws.iter_cols())

        seen: set = set()
        dups: set = set()
        # NOTE(review): the cell values are used as-is, so repeated empty
        # cells would presumably be reported as a duplicate ``None`` key;
        # confirm whether that is intended.
        for cell in first_column:
            value = cell.value
            if value in seen:
                dups.add(value)
            else:
                seen.add(value)

        if to_log:
            if dups:
                for x in dups:
                    logger.warning(
                        f'{self.path} contains duplicate key: "{x}". Masters cannot contain duplicate keys. Rename them.'
                    )
                return True
            logger.info(f"No duplicate keys in {self.path}")
            return False

        if dups:
            return dups
        return False

    def __repr__(self):
        return f"Master({self.path}, {self.quarter.quarter}, {self.quarter.year})"