aboutsummaryrefslogblamecommitdiffstats
path: root/datamaps/plugins/dft/master.py
blob: 04dfc19a8fd82c692865c28280634fe8b900bb49 (plain) (tree)
1
2
3
4
5
6
7
8
9
10






                                                       


                                                                   


                                  
                                             





                     
 


















                                                                                 
                   







                                                                                      










                                                                                     
                                                             










                                                           


                                         











                                                                                       
                                                             








                                                           









                                                                                     
                                        









                                                                                    
                                                                                




                                                                                                     
                                                                                             


























                                                                                            
 


                                                                               
                               
                                             









































                                                                                                            



                                                                    


















                                                      

             
                       
                                                                                




                           
                                                                    

























                                                                                                                                                                                                         


                                                                                                                     










                                                                                  
import re
import datetime
import logging
import unicodedata
from pathlib import Path
from typing import List, Tuple, Iterable, Optional, Any

from datamaps.plugins.dft.portfolio import project_data_from_master
from datamaps.process.cleansers import DATE_REGEX_4
from datamaps.core.temporal import Quarter

from openpyxl import load_workbook

logger = logging.getLogger("bcompiler.utils")


class ProjectData:
    """Thin wrapper around the key/value mapping held for a single project.

    The wrapped mapping is stored as-is; the helper methods select entries
    from it by key.
    """

    def __init__(self, d: dict) -> None:
        """Store ``d`` as the backing mapping.

        :py:func:`OrderedDict` is easiest to get from project_data_from_master[x]
        """
        self._data = d

    def __len__(self) -> int:
        """Return the number of entries in the backing mapping."""
        return len(self._data)

    def __getitem__(self, item):
        """Return the value stored under ``item`` in the backing mapping."""
        return self._data[item]

    def key_filter(self, key: str) -> List[Tuple]:
        """Return every ``(k, v)`` pair whose key contains ``key`` as a substring.

        Raises:
            KeyError: if no key in the mapping contains ``key``.
        """
        data = [item for item in self._data.items() if key in item[0]]
        if not data:
            raise KeyError("Sorry, there is no matching data")
        return data

    def pull_keys(self, input_iter: Iterable, flat=False) -> List[Tuple[Any, ...]]:
        """Select the entries whose key appears in ``input_iter``.

        Keys of the backing mapping are compared after replacing the EN DASH
        character with HYPHEN-MINUS (and, when ``flat`` is ``True``, after
        stripping surrounding whitespace).  The keys in ``input_iter`` are
        compared as given, so they should use HYPHEN-MINUS.

        Each selected pair is passed through the module-level
        ``_convert_str_date_to_object`` helper before being returned.

        Args:
            input_iter: the keys to pull; the result follows the order in
                which the keys appear here.
            flat: when ``True`` return only the values; otherwise return
                ``(key, value)`` tuples.

        Returns:
            A list of ``(key, value)`` tuples, or a list of bare values when
            ``flat`` is ``True``.  A key listed more than once in
            ``input_iter`` yields its entry once per occurrence.
        """
        # Only the literal ``True`` selects the flat form, as before.
        flatten = flat is True
        # Materialise once: the keys are scanned for every entry and then
        # indexed for ordering, which a one-shot iterator cannot support.
        wanted = list(input_iter)
        # Hoisted out of the loops: these lookups are loop-invariant.
        en_dash = unicodedata.lookup("EN DASH")
        hyphen = unicodedata.lookup("HYPHEN-MINUS")

        def normalise(key):
            # search and replace troublesome EN DASH character
            if flatten:
                key = key.strip()
            return key.replace(en_dash, hyphen)

        # Match on the normalised key in BOTH modes.  Ordering below also uses
        # the normalised key, so every matched entry is guaranteed to be found
        # by ``wanted.index`` (matching on the raw key while ordering on the
        # normalised one raised ValueError for keys containing an EN DASH).
        matched = []
        for pair in self._data.items():
            lookup_key = normalise(pair[0])
            matched.extend(pair for candidate in wanted if lookup_key == candidate)

        converted = [_convert_str_date_to_object(pair) for pair in matched]
        ordered = sorted(converted, key=lambda pair: wanted.index(normalise(pair[0])))
        if flatten:
            return [pair[1] for pair in ordered]
        return ordered

    def __repr__(self):
        return f"ProjectData() - with data: {id(self._data)}"


def _convert_str_date_to_object(d_str: tuple) -> Tuple[str, Optional[datetime.date]]:
    """Turn the value of a ``(key, value)`` pair into a date when it looks like one.

    The value (``d_str[1]``) is tested against ``DATE_REGEX_4``.  If it
    matches, it is split on ``"-"`` and the first three parts are used as
    year, month and day of a :py:class:`datetime.date`.

    Args:
        d_str: a ``(key, value)`` pair.

    Returns:
        ``(key, datetime.date)`` when the value could be converted; otherwise
        ``d_str`` itself, unchanged.  That covers values that are not strings,
        do not match the pattern, or match it but do not form a valid
        calendar date.
    """
    try:
        looks_like_date = re.match(DATE_REGEX_4, d_str[1])
    except TypeError:
        # The value is not something a regex can be matched against
        # (e.g. a number or None), so there is nothing to convert.
        return d_str
    if not looks_like_date:
        return d_str

    parts = d_str[1].split("-")
    try:
        return (d_str[0], datetime.date(int(parts[0]), int(parts[1]), int(parts[2])))
    except (TypeError, ValueError, IndexError):
        # int() and datetime.date() signal bad or out-of-range components with
        # ValueError, and a value with fewer than three "-"-separated parts
        # raises IndexError; in all of these cases leave the pair untouched
        # rather than aborting the caller.
        return d_str


class Master:
    """A Master object, representing the main central data item in ``datamaps``.

    Args:
        quarter (:py:class:`datamaps.core.temporal.Quarter`): created using ``Quarter(1, 2017)`` for example.
        path (str): path to the master xlsx file
        declared_month (int, optional): month number (1-12) used by the
            :py:attr:`month` property. Defaults to ``None``.

    A master object is a composition between a :py:class:`datamaps.core.temporal.Quarter` object and an
    actual master xlsx file on disk.

    You create one, either by creating the Quarter object first, and using that as the first
    parameter of the ``Master`` constructor, e.g.::

        from datamaps.core.temporal import Quarter
        from datamaps.plugins.dft.master import Master

        q1 = Quarter(1, 2016)
        m1 = Master(q1, '/tmp/master_1_2016.xlsx')

    or by doing both in one::

        m1 = Master(Quarter(1, 2016), '/tmp/master_1_2016.xlsx')

    Once you have a ``Master`` object, you can access project data from it, like this::

        project_data = m1['Project Title']


    The following *attributes* are available on `m1` once created as such, e.g.::

        data = m1.data
        quarter = m1.quarter
        filename = m1.filename
        ..etc
    """

    def __init__(
        self, quarter: Quarter, path: str, declared_month: Optional[int] = None
    ) -> None:
        self._quarter = quarter
        self._declared_month = declared_month
        self.path = path
        # The master file is parsed once, at construction time.
        self._data = project_data_from_master(self.path)
        self._project_titles = list(self.data.keys())
        self.year = self._quarter.year

    def __getitem__(self, project_name):
        """Return the data for ``project_name`` wrapped in a ``ProjectData``."""
        return ProjectData(self._data[project_name])

    @property
    def data(self):
        """Return all the data contained in the master in a large, nested dictionary.

        The resulting data structure contains a dictionary of :py:class:`collections.OrderedDict` items whose
        key is the name of a project::

            "Project Name": OrderedDict("key": "value"
                                        ...)

        This object can then be further interrogated, for example to obtain all key/values
        from a particular project, by doing::

            d = Master.data
            project_data = d['PROJECT_NAME']

        """
        return self._data

    @property
    def quarter(self):
        """Returns the ``Quarter`` object associated with the ``Master``.

        Example::

            q1 = m.quarter

        ``q1`` can then be further interrogated as documented in :py:class:`core.temporal.Quarter`.

        """

        return self._quarter

    @property
    def month(self):
        """Return the ``month`` attribute of the quarter's entry for the declared month.

        ``declared_month`` is translated to its English month name and the
        first item in ``self.quarter.months`` whose ``month`` attribute equals
        that name is selected.

        Raises:
            KeyError: if ``declared_month`` is not an integer from 1 to 12
                (this includes the default, ``None``).
            IndexError: if no item in ``self.quarter.months`` matches.
        """
        months = {
            1: "January",
            2: "February",
            3: "March",
            4: "April",
            5: "May",
            6: "June",
            7: "July",
            8: "August",
            9: "September",
            10: "October",
            11: "November",
            12: "December",
        }
        declared_name = months[self._declared_month]
        matching = [m.month for m in self.quarter.months if m.month == declared_name]
        return matching[0]

    @property
    def filename(self):
        """The filename of the master xlsx file, e.g. ``master_1_2017.xlsx``."""
        return Path(self.path).name

    @property
    def projects(self):
        """A list of project titles derived from the master xlsx."""
        return self._project_titles

    def duplicate_keys(self, to_log=None):
        """Checks for duplicate keys in a master xlsx file.

        The values in the first column of the active worksheet are compared;
        any value occurring more than once counts as a duplicate.

        Args:
            to_log (bool): Optional True or False, depending on whether you want to see duplicates reported in a ``WARNING`` log message.

        Returns:
            When ``to_log`` is truthy: ``True`` if duplicates were found (each
            one is logged as a warning), ``False`` otherwise (an info message
            is logged).

            When ``to_log`` is falsy: the set of duplicated values, or
            ``False`` if there are none.
        """
        wb = load_workbook(self.path)
        ws = wb.active
        # A default is supplied so that a worksheet yielding no columns is
        # treated as "no duplicates" instead of leaking StopIteration.
        first_column = next(ws.iter_cols(), ())
        seen: set = set()
        dups: set = set()
        for cell in first_column:
            value = cell.value
            if value in seen:
                dups.add(value)
            else:
                seen.add(value)

        if to_log:
            if dups:
                for x in dups:
                    logger.warning(
                        f'{self.path} contains duplicate key: "{x}". Masters cannot contain duplicate keys. Rename them.'
                    )
                return True
            logger.info(f"No duplicate keys in {self.path}")
            return False
        # Without logging the caller receives the duplicates themselves.
        return dups if dups else False

    def __repr__(self):
        return f"Master({self.path}, {self.quarter.quarter}, {self.quarter.year})"