datamaps/plugins/dft/master.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273

import re
import datetime
import logging
import unicodedata
from pathlib import Path
from typing import List, Tuple, Iterable, Optional, Any

from datamaps.plugins.dft.portfolio import project_data_from_master
from datamaps.process.cleansers import DATE_REGEX_4
from datamaps.core.temporal import Quarter

from openpyxl import load_workbook

logger = logging.getLogger("bcompiler.utils")


class ProjectData:
    """
    ProjectData class
    """

    def __init__(self, d: dict) -> None:
        """
        :py:func:`OrderedDict` is easiest to get from project_data_from_master[x]
        """
        self._data = d

    def __len__(self) -> int:
        return len(self._data)

    def __getitem__(self, item):
        return self._data[item]

    def key_filter(self, key: str) -> List[Tuple]:
        """
        Return a list of (k, v) tuples if k in master key.
        """
        data = [item for item in self._data.items() if key in item[0]]
        if not data:
            raise KeyError("Sorry, there is no matching data")
        return data

    def pull_keys(self, input_iter: Iterable, flat=False) -> List[Tuple[Any, ...]]:
        """
        Returns a list of (key, value) tuples from ProjectData if key matches a
        key. The order of tuples is based on the order of keys passed in the iterable.
        """
        if flat is True:
            # search and replace troublesome EN DASH character
            xs = [
                item
                for item in self._data.items()
                for i in input_iter
                if item[0]
                .strip()
                .replace(
                    unicodedata.lookup("EN DASH"), unicodedata.lookup("HYPHEN-MINUS")
                )
                == i
            ]
            xs = [_convert_str_date_to_object(x) for x in xs]
            ts = sorted(
                xs,
                key=lambda x: input_iter.index(
                    x[0]
                    .strip()
                    .replace(
                        unicodedata.lookup("EN DASH"),
                        unicodedata.lookup("HYPHEN-MINUS"),
                    )
                ),
            )
            ts = [item[1] for item in ts]
            return ts
        else:
            xs = [
                item
                for item in self._data.items()
                for i in input_iter
                if item[0].replace(
                    unicodedata.lookup("EN DASH"), unicodedata.lookup("HYPHEN-MINUS")
                )
                == i
            ]
            xs = [
                item for item in self._data.items() for i in input_iter if item[0] == i
            ]
            xs = [_convert_str_date_to_object(x) for x in xs]
            ts = sorted(
                xs,
                key=lambda x: input_iter.index(
                    x[0].replace(
                        unicodedata.lookup("EN DASH"),
                        unicodedata.lookup("HYPHEN-MINUS"),
                    )
                ),
            )
            return ts

    def __repr__(self):
        return f"ProjectData() - with data: {id(self._data)}"


def _convert_str_date_to_object(d_str: tuple) -> Tuple[str, Optional[datetime.date]]:
    try:
        if re.match(DATE_REGEX_4, d_str[1]):
            try:
                ds = d_str[1].split("-")
                return (d_str[0], datetime.date(int(ds[0]), int(ds[1]), int(ds[2])))
            except TypeError:
                return d_str
        else:
            return d_str
    except TypeError:
        return d_str


class Master:
    """A Master object, representing the main central data item in ``datamaps``.

    Args:
        quarter (:py:class:`bcompiler.api.Quarter`): creating using ``Quarter(1, 2017)`` for example.
        path (str): path to the master xlsx file

    A master object is a composition between a :py:class:`datamaps.api.Quarter` object and an
    actual master xlsx file on disk.

    You create one, either by creating the Quarter object first, and using that as the first
    parameter of the ``Master`` constructor, e.g.::

        from bcompiler.api import Quarter
        from bcompiler.api import Master

        q1 = Quarter(1, 2016)
        m1 = Master(q1, '/tmp/master_1_2016.xlsx')

    or by doing both in one::

        m1 = Master(Quarter(1, 2016), '/tmp/master_1_2016.xlsx')

    Once you have a ``Master`` object, you can access project data from it, like this::

        project_data = m1['Project Title']


    The following *attributes* are available on `m1` once created as such, e.g.::

        data = m1.data
        quarter = m1.quarter
        filename = m1.filename
        ..etc
    """

    def __init__(
        self, quarter: Quarter, path: str, declared_month: Optional[int] = None
    ) -> None:
        self._quarter = quarter
        self._declared_month = declared_month
        self.path = path
        self._data = project_data_from_master(self.path)
        self._project_titles = [item for item in self.data.keys()]
        self.year = self._quarter.year

    def __getitem__(self, project_name):
        return ProjectData(self._data[project_name])

    @property
    def data(self):
        """Return all the data contained in the master in a large, nested dictionary.

        The resulting data structure contains a dictionary of :py:class:`colletions.OrderedDict` items whose
        key is the name of a project::

            "Project Name": OrderedDict("key": "value"
                                        ...)

        This object can then be further interrogated, for example to obtain all key/values
        from a partictular project, by doing::

            d = Master.data
            project_data = d['PROJECT_NAME']

        """
        return self._data

    @property
    def quarter(self):
        """Returns the ``Quarter`` object associated with the ``Master``.

        Example::

            q1 = m.quarter

        ``q1`` can then be further interrogated as documented in :py:class:`core.temporal.Quarter`.

        """

        return self._quarter

    @property
    def month(self):
        """
        Returns the ``Month`` object associated with the ``Master``.
        """
        months = {
            1: "January",
            2: "February",
            3: "March",
            4: "April",
            5: "May",
            6: "June",
            7: "July",
            8: "August",
            9: "September",
            10: "October",
            11: "November",
            12: "December",
        }
        return [
            m.month
            for m in self.quarter.months
            if m.month == months[self._declared_month]
        ][0]

    @property
    def filename(self):
        """The filename of the master xlsx file, e.g. ``master_1_2017.xlsx``."""
        p = Path(self.path)
        return p.name

    @property
    def projects(self):
        """A list of project titles derived from the master xlsx."""
        return self._project_titles

    def duplicate_keys(self, to_log=None):
        """Checks for duplicate keys in a master xlsx file.

        Args:
            to_log (bool): Optional True or False, depending on whether you want to see duplicates reported in a ``WARNING`` log message. This is used mainly for internal purposes within ``bcompiler``.

        Returns:
            duplicates (set): a set of duplicated keys
        """
        wb = load_workbook(self.path)
        ws = wb.active
        col_a = next(ws.iter_cols())
        col_a = [item.value for item in col_a]
        seen: set = set()
        uniq = []
        dups: set = set()
        for x in col_a:
            if x not in seen:
                uniq.append(x)
                seen.add(x)
            else:
                dups.add(x)
        if to_log and len(dups) > 0:
            for x in dups:
                logger.warning(
                    f'{self.path} contains duplicate key: "{x}". Masters cannot contain duplicate keys. Rename them.'
                )
            return True
        elif to_log and len(dups) == 0:
            logger.info(f"No duplicate keys in {self.path}")
            return False
        elif len(dups) > 0:
            return dups
        else:
            return False

    def __repr__(self):
        return f"Master({self.path}, {self.quarter.quarter}, {self.quarter.year})"