aboutsummaryrefslogblamecommitdiffstats
path: root/datamaps/process/cleansers.py
blob: 7c5aa8d69d72d2658b8a8e33583a07c6ce6e59f5 (plain) (tree)
1
2
3
4
5
6
7
8
9
               
              



                               

                                 
                                                




















































































































































































                                                                              

                                                                                            
































































































                                                                                   
import datetime
import logging
import re
from datetime import date
from operator import itemgetter

from dateutil.parser import parse

# Module-level logger. The name is a fixed string rather than __name__.
logger = logging.getLogger("bcompiler.cleanser")

# --- Text substitutions: each *_REGEX is paired with its *_FIX replacement ---

# An en dash character is replaced with an ASCII hyphen-minus.
ENDASH_REGEX = r"–"
ENDASH_FIX = r"-"
# A comma plus at most one following whitespace character becomes one space.
COMMA_REGEX = r",\s?"
COMMA_FIX = r" "
# An apostrophe at the very start of the string is removed.
APOS_REGEX = r"^'"
APOS_FIX = r""
# At least one character followed by one or two spaces at the end of the
# string. There is no *_FIX for this rule.
TRAILING_SPACE_REGEX = r"(.+)( |  )$"

# --- Patterns that recognise values to be converted to Python objects ---

# 2-4 digits, a separator (/, - or .), 1-2 digits, a separator, 2-4 digits.
# Anchored at the start only, so trailing text after the date still matches.
DATE_REGEX = r"^(\d{2,4})(\/|-|\.)(\d{1,2})(\/|-|\.)(\d{2,4})"
# Same shape as DATE_REGEX but the final group is limited to 1-2 digits.
# Not referenced in the portion of the file visible alongside this block.
DATE_REGEX_4 = r"^(\d{2,4})(/|-|\.)(\d{1,2})(/|-|\.)(\d{1,2})"
# A date (separators / or - only) followed by whitespace and the literal
# time "0:00:00".
DATE_REGEX_TIME = r"^(\d{2,4})(/|-)(\d{1,2})(/|-)(\d{1,2})\s(0:00:00)"
# Optionally signed whole number occupying the entire string.
INT_REGEX = r"^[-+]?\d+$"
# Optionally signed number with a mandatory decimal point and fractional
# digits; the integer part may be empty (".5").
FLOAT_REGEX = r"^[-+]?([0-9]*)\.[0-9]+$"

# --- More text substitutions ---

# Every newline becomes " | ".
NL_REGEX = r"\n"
NL_FIX = r" | "
# A space and a pipe that are immediately followed by a non-whitespace
# character get a space inserted after the pipe. The following character is
# only looked at (lookahead), not matched, so that re.sub() with
# SPACE_PIPE_CHAR_FIX keeps it: "a |b" -> "a | b". Matching it outright would
# make the substitution delete it.
SPACE_PIPE_CHAR_REGEX = r"\ \|(?=\S)"
SPACE_PIPE_CHAR_FIX = r" | "

# --- Numeric notations ---

# One to three digits followed by a percent sign; group 1 is the number.
PERCENT_REGEX = r"^(\d{1,3})%$"
# Pound-sterling amount. Group 1 is an optional leading minus sign, group 2
# the amount with at most two decimal places; any further digits fall into
# group 4.
POUND_REGEX = r"^(-)?£(\d+(\.\d{1,2})?)(\d+)?$"  # handles negative numbers


class Cleanser:
    """
    Takes a string, and cleans it.

    On construction, every rule in ``self._checks`` is counted against the
    string. ``clean()`` then runs the fix of each rule that matched, the most
    frequently matching rule first. Text fixes return a new string; the
    conversion fixes (date, date_time, int, float, percent, pound) return a
    Python value instead, and cleaning stops once the value is no longer a
    string.

    Doctests:
    >>> t = "Text, with commas"
    >>> c = Cleanser(t)
    >>> c.clean()
    'Text with commas'
    >>> a = "\'Text with leading apos."
    >>> c = Cleanser(a)
    >>> c.clean()
    'Text with leading apos.'

    """

    def __init__(self, string):
        """
        Store ``string`` and count how often each cleaning rule matches it.

        Args:
            string: the text to clean. It is handed directly to
                ``re.finditer``, so it must be a ``str``.
        """
        self.string = string

        # One row per cleaning rule: (c_type, rule, fix, func). The rows are
        # expanded into the list of dicts that self.clean() runs through,
        # each starting with a match count of 0.
        rules = (
            ("emdash", ENDASH_REGEX, ENDASH_FIX, self._endash),
            ("commas", COMMA_REGEX, COMMA_FIX, self._commas),
            ("leading_apostrophe", APOS_REGEX, APOS_FIX, self._apostrophe),
            ("newline", NL_REGEX, NL_FIX, self._newline),
            ("double_space", "  ", " ", self._doublespace),
            ("trailing_space", TRAILING_SPACE_REGEX, None,
             self._trailingspace),
            ("pipe_char", SPACE_PIPE_CHAR_REGEX, SPACE_PIPE_CHAR_FIX,
             self._space_pipe_char),
            ("date", DATE_REGEX, None, self._date),
            ("date_time", DATE_REGEX_TIME, None, self._date_time),
            ("int", INT_REGEX, None, self._int),
            ("float", FLOAT_REGEX, None, self._float),
            ("percent", PERCENT_REGEX, None, self._percent),
            ("pound", POUND_REGEX, None, self._pound),
        )
        self._checks = [
            dict(c_type=c_type, rule=rule, fix=fix, func=func, count=0)
            for c_type, rule, fix, func in rules
        ]
        self.checks_l = len(self._checks)
        self._analyse()

    def _sort_checks(self):
        """
        Sorts the list of dicts in self._checks by their count, highest
        first, so that every rule with a count above 0 sits ahead of the
        rules that never matched. Rules with equal counts keep their
        relative order.
        """
        self._checks = sorted(self._checks,
                              key=itemgetter("count"),
                              reverse=True)

    def _endash(self, regex, fix):
        """
        Turns – into -.
        """
        return re.sub(regex, fix, self.string)

    def _pound(self, regex, fix):
        """
        Turns £12.24 into 12.24 (a float); a leading "-" makes it negative.

        Returns the string unchanged if it does not match ``regex`` (for
        example because an earlier fix has altered it).
        """
        m = re.match(regex, self.string)
        if m is None:
            return self.string
        amount = float(m.group(2))
        return amount * -1 if m.group(1) == "-" else amount

    def _percent(self, regex, fix):
        """
        Turns 100% into 1.0.

        Returns the string unchanged if it does not match ``regex``.
        """
        m = re.match(regex, self.string)
        if m is None:
            return self.string
        return int(m.group(1)) / 100

    def _float(self, regex, fix):
        """
        Turns numbers that look like floats into floats.

        Returns the string unchanged if it cannot be converted (for example
        because an earlier fix has altered it).
        """
        try:
            return float(self.string)
        except ValueError:
            return self.string

    def _int(self, regex, fix):
        """
        Turns numbers that look like integers into integers.

        Returns the string unchanged if it cannot be converted (for example
        because an earlier fix has altered it).
        """
        try:
            return int(self.string)
        except ValueError:
            return self.string

    def _date(self, regex, fix):
        """
        Handles dates in "03/05/2016" format.

        A string whose first "-"-separated part has four characters is read
        as year-month-day; anything else is given to dateutil's ``parse``
        with ``dayfirst=True``.

        Returns:
            A ``datetime.date``, or the string unchanged when it does not
            match ``regex`` or cannot be turned into a date (a warning is
            logged in the latter case).
        """
        # TODO: separate function needed here to assert that month values are in 'MM' format
        # TODO: investigate rules for the year
        m = re.match(regex, self.string)
        if m is None:
            return self.string
        # NOTE(review): the message below talks about two-digit years, but
        # this test only fires when the last group is 1965 or 1966 - confirm
        # which of the two is intended.
        if int(m.groups()[-1]) in (1965, 1966):
            logger.warning(
                ("Dates inputted as dd/mm/65 will migrate as dd/mm/2065. "
                 "Dates inputted as dd/mm/66 will migrate as dd/mm/1966."))
        parts = self.string.split("-")
        try:
            if len(parts[0]) == 4:  # year is first
                return datetime.date(int(parts[0]), int(parts[1]),
                                     int(parts[2]))
            return parse(self.string, dayfirst=True).date()
        except (IndexError, ValueError):
            # Never hand back None: the caller keeps whatever we return as
            # the cleaned value, so fall back to the untouched string.
            logger.warning(
                'Potential date issue (perhaps a date mixed with free text?): "{}"'
                .format(self.string))
            return self.string

    def _date_time(self, regex, fix):
        """
        Handles dates in "2017-05-01 0:00:00" format. We get this from the
        csv file when we send it back out to templates/forms. Returns a Python
        date object, or the string unchanged when it does not match
        ``regex`` or does not describe a valid calendar date.
        """
        m = re.match(regex, self.string)
        if m is None:
            return self.string
        year, month, day = (int(m.group(n)) for n in (1, 3, 5))
        try:
            return date(year, month, day)
        except ValueError:
            logger.error("Incorrect date format {}!".format(self.string))
            return self.string

    def _commas(self, regex, fix):
        """
        Handles commas in self.string according to rule in self._checks
        """
        return re.sub(regex, fix, self.string)

    def _apostrophe(self, regex, fix):
        """Removes every apostrophe at the start of the string."""
        return self.string.lstrip("'")

    def _newline(self, regex, fix):
        """Handles newlines anywhere in string."""
        return re.sub(regex, fix, self.string)

    def _doublespace(self, regex, fix):
        """Handles double-spaces anywhere in string."""
        return re.sub(regex, fix, self.string)

    def _trailingspace(self, regex, fix):
        """Handles trailing space in the string.

        Uses ``str.strip()``, so leading whitespace is removed as well.
        """
        return self.string.strip()

    def _space_pipe_char(self, regex, fix):
        """Replaces every match of ``regex`` (space, pipe, character) with
        ``fix``."""
        return re.sub(regex, fix, self.string)

    def _access_checks(self, c_type):
        """Helper method returns the index of rule in self._checks
        when given a c_type. Raises StopIteration for an unknown c_type."""
        return next(position
                    for position, item in enumerate(self._checks)
                    if item["c_type"] == c_type)

    def _analyse(self):
        """
        Uses the self._checks table as a basis for counting how many times
        each rule matches self.string, adding the number of matches to the
        rule's "count".
        """
        for check in self._checks[:self.checks_l]:
            check["count"] += sum(
                1 for _ in re.finditer(check["rule"], self.string))

    def clean(self):
        """Runs each applicable cleaning action and returns the cleaned
        value.

        Returns:
            The cleaned string, or the converted value (date, int, float)
            if one of the conversion rules applied.
        """
        self._sort_checks()
        for check in self._checks:
            if check["count"] <= 0:
                # Sorted highest count first: nothing further down matched.
                break
            if not isinstance(self.string, str):
                # A conversion already produced a non-text value; the
                # remaining fixes only understand strings.
                break
            self.string = check["func"](check["rule"], check["fix"])
            check["count"] = 0
        return self.string