import datetime
import logging
import re
from datetime import date
from operator import itemgetter
from dateutil.parser import parse
logger = logging.getLogger("bcompiler.cleanser")
ENDASH_REGEX = r"–"
ENDASH_FIX = r"-"
COMMA_REGEX = r",\s?"
COMMA_FIX = r" "
APOS_REGEX = r"^'"
APOS_FIX = r""
TRAILING_SPACE_REGEX = r"(.+)( | )$"
DATE_REGEX = r"^(\d{2,4})(\/|-|\.)(\d{1,2})(\/|-|\.)(\d{2,4})"
DATE_REGEX_4 = r"^(\d{2,4})(/|-|\.)(\d{1,2})(/|-|\.)(\d{1,2})"
DATE_REGEX_TIME = r"^(\d{2,4})(/|-)(\d{1,2})(/|-)(\d{1,2})\s(0:00:00)"
INT_REGEX = r"^[-+]?\d+$"
FLOAT_REGEX = r"^[-+]?([0-9]*)\.[0-9]+$"
NL_REGEX = r"\n"
NL_FIX = r" | "
SPACE_PIPE_CHAR_REGEX = r"\ \|\S"
SPACE_PIPE_CHAR_FIX = r" | "
PERCENT_REGEX = r"^(\d{1,3})%$"
POUND_REGEX = r"^(-)?£(\d+(\.\d{1,2})?)(\d+)?$" # handles negative numbers
class Cleanser:
"""
Takes a string, and cleans it.
Doctests:
>>> t = "Text, with commas"
>>> c = Cleanser(t)
>>> c.clean()
'Text with commas'
>>> a = "\'Text with leading apos."
>>> c = Cleanser(a)
>>> c.clean()
'Text with leading apos.'
"""
def __init__(self, string):
self.string = string
# a list of dicts that describe everything needed to fix errors in
# string passed to class constructor. Method self.clean() runs through
# them, fixing each in turn.
self._checks = [
dict(
c_type="emdash",
rule=ENDASH_REGEX,
fix=ENDASH_FIX,
func=self._endash,
count=0,
),
dict(
c_type="commas",
rule=COMMA_REGEX,
fix=COMMA_FIX,
func=self._commas,
count=0,
),
dict(
c_type="leading_apostrophe",
rule=APOS_REGEX,
fix=APOS_FIX,
func=self._apostrophe,
count=0,
),
dict(c_type="newline",
rule=NL_REGEX,
fix=NL_FIX,
func=self._newline,
count=0),
dict(
c_type="double_space",
rule=" ",
fix=" ",
func=self._doublespace,
count=0,
),
dict(
c_type="trailing_space",
rule=TRAILING_SPACE_REGEX,
fix=None,
func=self._trailingspace,
count=0,
),
dict(
c_type="pipe_char",
rule=SPACE_PIPE_CHAR_REGEX,
fix=SPACE_PIPE_CHAR_FIX,
func=self._space_pipe_char,
count=0,
),
dict(c_type="date",
rule=DATE_REGEX,
fix=None,
func=self._date,
count=0),
dict(
c_type="date_time",
rule=DATE_REGEX_TIME,
fix=None,
func=self._date_time,
count=0,
),
dict(c_type="int",
rule=INT_REGEX,
fix=None,
func=self._int,
count=0),
dict(c_type="float",
rule=FLOAT_REGEX,
fix=None,
func=self._float,
count=0),
dict(
c_type="percent",
rule=PERCENT_REGEX,
fix=None,
func=self._percent,
count=0,
),
dict(c_type="pound",
rule=POUND_REGEX,
fix=None,
func=self._pound,
count=0),
]
self.checks_l = len(self._checks)
self._analyse()
def _sort_checks(self):
"""
Sorts the list of dicts in self._checks by their count, highest
first, so that when the fix methods run down them, they always have
a count with a value higher than 0 to run with, otherwise later
fixes might not get hit.
"""
self._checks = sorted(self._checks,
key=itemgetter("count"),
reverse=True)
def _endash(self, regex, fix):
"""
Turns – into -.
"""
return re.sub(regex, fix, self.string)
def _pound(self, regex, fix):
"""
Turns £12.24 into 12.24 (a float).
"""
m = re.match(regex, self.string)
sum_p = m.group(2)
if m.group(1) == "-":
return float(sum_p) * -1
else:
return float(sum_p)
def _percent(self, regex, fix):
"""
Turns 100% into 1.0.
"""
m = re.match(regex, self.string)
p = int(m.group(1))
return p / 100
def _float(self, regex, fix):
"""
Turns numbers that look like floats into floats.
"""
return float(self.string)
def _int(self, regex, fix):
"""
Turns numbers that look like integers into integers.
"""
return int(self.string)
def _date(self, regex, fix):
"""
Handles dates in "03/05/2016" format.
"""
# TODO: separate function needed here to assert that month values are in 'MM' format
# TODO: investigate rules for the year
m = re.match(regex, self.string)
if int(m.groups()[-1]) in range(1965, 1967):
logger.warning(
("Dates inputted as dd/mm/65 will migrate as dd/mm/2065. "
"Dates inputted as dd/mm/66 will migrate as dd/mm/1966."))
try:
if len(m.string.split("-")[0]) == 4: # year is first
return datetime.date(
int(m.string.split("-")[0]),
int(m.string.split("-")[1]),
int(m.string.split("-")[2]),
)
else:
return parse(m.string, dayfirst=True).date()
except IndexError:
pass
except ValueError:
logger.warning(
'Potential date issue (perhaps a date mixed with free text?): "{}"'
.format(self.string))
return self.string
def _date_time(self, regex, fix):
"""
Handles dates in "2017-05-01 0:00:00" format. We get this from the
csv file when we send it back out to templates/forms. Returns a Python
date object.
"""
m = re.match(regex, self.string)
year = int(m.group(1))
month = int(m.group(3))
day = int(m.group(5))
try:
return date(year, month, day)
except ValueError:
logger.error("Incorrect date format {}!".format(self.string))
return self.string
def _commas(self, regex, fix):
"""
Handles commas in self.string according to rule in self._checks
"""
# we want to sort the list first so self._checks has any item
# with a count > 0 up front, otherwise if a count of 0 appears
# before it in the list, the > 0 count never gets fixed
return re.sub(regex, fix, self.string)
def _apostrophe(self, regex, fix):
"""Handles apostrophes as first char of the string."""
return self.string.lstrip("'")
def _newline(self, regex, fix):
"""Handles newlines anywhere in string."""
return re.sub(regex, fix, self.string)
def _doublespace(self, regex, fix):
"""Handles double-spaces anywhere in string."""
return re.sub(regex, fix, self.string)
def _trailingspace(self, regex, fix):
"""Handles trailing space in the string."""
return self.string.strip()
def _space_pipe_char(self, regex, fix):
"""Handles space pipe char anywhere in string."""
return re.sub(regex, fix, self.string)
def _access_checks(self, c_type):
"""Helper method returns the index of rule in self._checks
when given a c_type"""
return self._checks.index(
next(item for item in self._checks if item["c_type"] == c_type))
def _analyse(self):
"""
Uses the self._checks table as a basis for counting the number of
each cleaning target required, and calling the appropriate method
to clean.
"""
i = 0
while i < self.checks_l:
matches = re.finditer(self._checks[i]["rule"], self.string)
if matches:
self._checks[i]["count"] += len(list(matches))
i += 1
def clean(self):
"""Runs each applicable cleaning action and returns the cleaned
string."""
self._sort_checks()
for check in self._checks:
if check["count"] > 0:
self.string = check["func"](check["rule"], check["fix"])
check["count"] = 0
else:
return self.string
return self.string