aboutsummaryrefslogblamecommitdiffstats
path: root/datamaps/core/master.py
blob: 1fbfe902c746d6d37f465934327759b452ff01b6 (plain) (tree)















































































































































































































                                                                                                                                                                                                         
import re
import datetime
import logging
import unicodedata
from pathlib import Path
from typing import List, Tuple, Iterable, Optional, Any

from ..utils import project_data_from_master
from ..process.cleansers import DATE_REGEX_4
from .temporal import Quarter

from openpyxl import load_workbook

# Module-level logger used by Master.duplicate_keys(). Note that it is
# registered under the 'bcompiler.utils' name, not under this module's name.
logger = logging.getLogger('bcompiler.utils')


class ProjectData:
    """
    Thin wrapper around the key/value data belonging to a single project.

    The wrapped mapping is stored as-is (no copy is made); ``len()`` and
    ``[]`` are forwarded straight to it.
    """
    def __init__(self, d: dict) -> None:
        """
        :param d: mapping of master keys to values for one project. An
            :py:func:`OrderedDict` is easiest to get from
            project_data_from_master[x].
        """
        self._data = d

    def __len__(self) -> int:
        """Return the number of keys held for the project."""
        return len(self._data)

    def __getitem__(self, item):
        """Return the value stored under the exact key ``item``."""
        return self._data[item]

    def key_filter(self, key: str) -> List[Tuple]:
        """
        Return a list of (k, v) tuples for every k that contains ``key``.

        :raises KeyError: if no key in the data contains ``key``.
        """
        data = [item for item in self._data.items() if key in item[0]]
        if not data:
            raise KeyError("Sorry, there is no matching data")
        return data

    def pull_keys(self, input_iter: Iterable, flat=False) -> List[Tuple[Any, ...]]:
        """
        Return the entries of this project whose key appears in ``input_iter``.

        The result is ordered by the position of each key in ``input_iter``
        (first occurrence), not by the order of the underlying data. Values
        are passed through ``_convert_str_date_to_object`` on the way out.

        :param input_iter: the keys to pull. Any iterable is accepted; it is
            materialised into a list once so that generators and other
            one-shot iterables behave the same as lists.
        :param flat: when exactly ``True``, keys are compared after stripping
            surrounding whitespace and replacing EN DASH with HYPHEN-MINUS,
            and only the values are returned. Otherwise keys must match
            exactly and ``(key, value)`` tuples are returned.
        :return: a list of values (``flat=True``) or of ``(key, value)``
            tuples, empty when nothing matches.
        """
        # Materialise once: the keys are scanned for every data item and are
        # needed again (via .index) to establish the output order.
        wanted = list(input_iter)
        flatten = flat is True

        if flatten:
            en_dash = unicodedata.lookup('EN DASH')
            hyphen = unicodedata.lookup('HYPHEN-MINUS')

            def lookup_key(key):
                # search and replace troublesome EN DASH character
                return key.strip().replace(en_dash, hyphen)
        else:
            def lookup_key(key):
                # Exact matching: the key is compared untouched.
                return key

        matches = [
            _convert_str_date_to_object(item)
            for item in self._data.items()
            for wanted_key in wanted
            if lookup_key(item[0]) == wanted_key
        ]
        # The same lookup_key is used for filtering and for ordering, so
        # every matched key is guaranteed to be found by .index().
        matches.sort(key=lambda pair: wanted.index(lookup_key(pair[0])))

        if flatten:
            return [pair[1] for pair in matches]
        return matches

    def __repr__(self):
        return f"ProjectData() - with data: {id(self._data)}"


def _convert_str_date_to_object(d_str: tuple) -> Tuple[str, Optional[datetime.date]]:
    """
    Convert the value of a ``(key, value)`` pair to a ``datetime.date``.

    The value is converted only when it matches ``DATE_REGEX_4``; it is then
    split on ``-`` and its first three parts are used as year, month and day.

    :param d_str: a ``(key, value)`` pair.
    :return: ``(key, datetime.date)`` when the value was converted, otherwise
        ``d_str`` itself, unchanged. This covers values that are not strings,
        strings that do not match, and matching strings that do not describe
        a real calendar date.
    """
    value = d_str[1]
    try:
        matched = re.match(DATE_REGEX_4, value)
    except TypeError:
        # Value is not something a regex can be applied to (None, int, ...).
        return d_str
    if not matched:
        return d_str
    try:
        parts = value.split('-')
        return (d_str[0], datetime.date(int(parts[0]), int(parts[1]), int(parts[2])))
    except (TypeError, ValueError, IndexError):
        # int() and datetime.date() raise ValueError for malformed or
        # out-of-range components (e.g. month 13); a short split raises
        # IndexError. Leave such values untouched rather than crash.
        return d_str


class Master:
    """The main central data item in ``bcompiler``: a master xlsx file paired with a quarter.

    Args:
        quarter (:py:class:`bcompiler.api.Quarter`): created using ``Quarter(1, 2017)``, for example.
        path (str): path to the master xlsx file

    A master object is a composition of a :py:class:`bcompiler.api.Quarter` object and an
    actual master xlsx file on disk.

    Create the ``Quarter`` first and hand it to the ``Master`` constructor as its first
    argument::

        from bcompiler.api import Quarter
        from bcompiler.api import Master

        q1 = Quarter(1, 2016)
        m1 = Master(q1, '/tmp/master_1_2016.xlsx')

    or do both in a single expression::

        m1 = Master(Quarter(1, 2016), '/tmp/master_1_2016.xlsx')

    Project data is then reached by indexing the ``Master`` with a project title::

        project_data = m1['Project Title']


    The following *attributes* are available on `m1` once it has been created, e.g.::

        data = m1.data
        quarter = m1.quarter
        filename = m1.filename
        ..etc
    """
    def __init__(self, quarter: Quarter, path: str) -> None:
        self._quarter = quarter
        self.path = path
        # The master file is read once, here, at construction time.
        self._data = project_data_from_master(self.path)
        self._project_titles = list(self.data.keys())
        self.year = self._quarter.year

    def __getitem__(self, project_name):
        """Return the data for ``project_name`` wrapped in a ``ProjectData``."""
        project_values = self._data[project_name]
        return ProjectData(project_values)

    @property
    def data(self):
        """All the data contained in the master, as one large nested dictionary.

        The structure is a dictionary of :py:class:`collections.OrderedDict` items, keyed
        by project name::

            "Project Name": OrderedDict("key": "value"
                                        ...)

        It can be interrogated further, for example to obtain every key/value
        of a particular project::

            d = Master.data
            project_data = d['PROJECT_NAME']

        """
        return self._data

    @property
    def quarter(self):
        """The ``Quarter`` object associated with the ``Master``.

        Example::

            q1 = m.quarter

        ``q1`` can then be further interrogated as documented in :py:class:`core.temporal.Quarter`.

        """
        return self._quarter

    @property
    def filename(self):
        """The final path component of the master xlsx file, e.g. ``master_1_2017.xlsx``.
        """
        return Path(self.path).name

    @property
    def projects(self):
        """The list of project titles, taken from the keys of the master data.
        """
        return self._project_titles

    def duplicate_keys(self, to_log=None):
        """Check the first column of the master xlsx file for duplicate keys.

        The workbook at ``self.path`` is re-opened and the first column of its
        active worksheet is scanned.

        Args:
            to_log (bool): Optional. When truthy, the outcome is written to the log instead of being returned as a set. This is used mainly for internal purposes within ``bcompiler``.

        Returns:
            With ``to_log`` truthy: ``True`` if duplicates were found (each one is
            reported in a ``WARNING`` log message), ``False`` otherwise (an ``INFO``
            message is logged).
            With ``to_log`` falsy: the set of duplicated keys, or ``False`` if there
            are none.
        """
        workbook = load_workbook(self.path)
        sheet = workbook.active
        first_column = next(sheet.iter_cols())

        # NOTE(review): cells without a value presumably come through as None,
        # so several blank cells would be reported as a duplicate None key -
        # confirm whether that is intended.
        seen: set = set()
        duplicates: set = set()
        for cell in first_column:
            key = cell.value
            if key in seen:
                duplicates.add(key)
            else:
                seen.add(key)

        if not to_log:
            return duplicates if duplicates else False

        if not duplicates:
            logger.info(f"No duplicate keys in {self.path}")
            return False

        for key in duplicates:
            logger.warning(f'{self.path} contains duplicate key: "{key}". Masters cannot contain duplicate keys. Rename them.')
        return True

    def __repr__(self):
        quarter = self.quarter
        return f"Master({self.path}, {quarter.quarter}, {quarter.year})"