"""This module contains the implementation of CompanyNameCleaner class from OS-Climate's financial-entity-cleaner package."""
import enum
import json
import logging
import re
from importlib.resources import files
from typing import Literal
import pandas as pd
from pydantic import BaseModel
[docs]
# Module-level logger named after this module; used to warn about non-string inputs.
logger = logging.getLogger(__name__)
[docs]
# Catalogue of the available regex cleaning rules.
#
# Each entry maps a rule name to a two-element list ``[replacement, regex]``:
# every match of ``regex`` is substituted by ``replacement``. A rule whose
# "regex" is the *name* of another rule (see
# ``repeat_remove_words_in_parentheses``) is an alias that re-runs that rule.
CLEANING_RULES_DICT = {
    "remove_email": [" ", r"\S*@\S*\s?"],
    "remove_url": [" ", r"https*\S+"],
    # The "the" rules are anchored on a word boundary so that only the
    # stand-alone word is affected (e.g. "breathe" / "theranos" are left intact).
    "remove_word_the_from_the_end": [" ", r"\bthe$"],
    "remove_word_the_from_the_beginning": [" ", r"^the\b"],
    "place_word_the_at_the_beginning": [" ", r"\bthe$"],
    # The dot after "www" is escaped so it matches a literal "." only.
    "remove_www_address": [" ", r"https?://[.\w]{3,}|www\.[.\w]{3,}"],
    "enforce_single_space_between_words": [" ", r"\s+"],
    "replace_amperstand_by_AND": [" and ", r"&"],
    "add_space_between_amperstand": [" & ", r"&"],
    "add_space_before_opening_parentheses": [" (", r"\("],
    "add_space_after_closing_parentheses": [") ", r"\)"],
    "replace_amperstand_between_space_by_AND": [" and ", r"\s+&\s+"],
    "replace_hyphen_by_space": [" ", r"-"],
    "replace_hyphen_between_spaces_by_single_space": [" ", r"\s+-\s+"],
    "replace_underscore_by_space": [" ", r"_"],
    "replace_underscore_between_spaces_by_single_space": [" ", r"\s+_\s+"],
    "remove_all_punctuation": [" ", r"([^\w\s])"],
    "remove_punctuation_except_dot": [" ", r"([^\w\s.])"],
    "remove_mentions": [" ", r"@\S+"],
    "remove_hashtags": [" ", r"#\S+"],
    "remove_numbers": [" ", r"\w*\d+\w*"],
    "remove_text_puctuation": [" ", r'\;|\:|\,|\.|\?|\!|"'],
    "remove_text_puctuation_except_dot": [" ", r'\;|\:|\,|\?|\!|"'],
    "remove_math_symbols": [" ", r"\+|\-|\*|\>|\<|\=|\%"],
    "remove_math_symbols_except_dash": [" ", r"\+|\*|\>|\<|\=|\%"],
    "remove_parentheses": ["", r"\(|\)"],
    "remove_brackets": ["", r"\[|\]"],
    "remove_curly_brackets": ["", r"\{|\}"],
    "remove_single_quote_next_character": [" ", r"'\w+"],
    "remove_single_quote": [" ", r"'"],
    "remove_double_quote": [" ", r'"'],
    "remove_words_in_parentheses": [" ", r"\([^()]*\)"],
    # Alias: the "regex" is the name of the rule to run a second time.
    "repeat_remove_words_in_parentheses": [" ", r"remove_words_in_parentheses"],
}
[docs]
class LegalTermLocation(enum.Enum):
    """The location of the legal terms within the name string."""

    #: Legal terms are only matched when they are the last thing in the name.
    AT_THE_END = 1
    #: Legal terms are matched wherever they occur in the name.
    # NOTE(review): only AT_THE_END is referenced by name in this module; this
    # member is the un-anchored alternative implied by the legal-term matching
    # code — confirm the member name/value against existing callers.
    ANYWHERE = 2
[docs]
class CompanyNameCleaner(BaseModel):
    """Class to normalize/clean up text based company names.

    Cleaning a name consists of: optionally dropping non-ASCII characters,
    stripping and lower-casing, applying the regex rules selected in
    ``cleaning_rules_list``, optionally normalizing legal terms, applying the
    requested output letter case and finally collapsing whitespace.
    """

    # Constants used internally by the class
    __NAME_LEGAL_TERMS_DICT_FILE = "us_legal_forms.json"
    __NAME_JSON_ENTRY_LEGAL_TERMS = "legal_forms"

    #: Names of the rules from ``CLEANING_RULES_DICT`` to apply, in the order
    #: in which they are listed here.
    cleaning_rules_list: list[str] = [
        "remove_word_the_from_the_end",
        "remove_word_the_from_the_beginning",
        "replace_amperstand_between_space_by_AND",
        "replace_hyphen_by_space",
        "replace_hyphen_between_spaces_by_single_space",
        "replace_underscore_by_space",
        "replace_underscore_between_spaces_by_single_space",
        "remove_all_punctuation",
        "remove_numbers",
        "remove_math_symbols",
        "remove_words_in_parentheses",
        "remove_parentheses",
        "remove_brackets",
        "remove_curly_brackets",
        "enforce_single_space_between_words",
    ]

    #: A flag to indicate if the cleaning process must normalize
    #: text's legal terms. e.g. LTD => LIMITED.
    normalize_legal_terms: bool = True

    #: Define if unicode characters should be removed from text's name
    #: This cleaning rule is treated separated from the regex rules because it depends on the
    #: language of the text's name. For instance, russian or japanese text's may contain
    #: unicode characters, while portuguese and french companies may not.
    remove_unicode: bool = False

    #: Define the letter case of the cleaning output
    #: ("upper" is accepted because ``get_clean_data`` implements it).
    output_lettercase: Literal["lower", "upper", "title"] = "lower"

    #: Where in the string are legal terms found
    legal_term_location: LegalTermLocation = LegalTermLocation.AT_THE_END

    #: Define if the letters with accents are replaced with non-accented ones
    remove_accents: bool = False

    def _apply_regex_rules(
        self, str_value: str, dict_regex_rules: dict[str, list[str]]
    ) -> str:
        r"""Applies several cleaning rules based on a custom dictionary.

        The dictionary must contain cleaning rules written in regex format.
        Rules are applied in the dictionary's iteration order.

        Arguments:
            str_value (str): any value as string to be cleaned up.
            dict_regex_rules (dict): a dictionary of cleaning rules writen in regex with the format
                [rule name] : ['replacement', 'regex rule']

        Returns:
            (str): the modified/cleaned value.
        """
        clean_value = str_value
        for name_rule, cleaning_rule in dict_regex_rules.items():
            replacement = cleaning_rule[0]
            regex_rule = cleaning_rule[1]

            # A rule may be an alias: its "regex" is the name of another rule,
            # which allows running that rule a second time. Resolve the alias
            # in the supplied rules first and fall back to the full catalogue,
            # so an alias still works when its target was not selected itself
            # (otherwise the rule *name* would be used as a literal regex).
            if regex_rule in dict_regex_rules:
                referenced_rule = dict_regex_rules[regex_rule]
            else:
                referenced_rule = CLEANING_RULES_DICT.get(regex_rule)
            if referenced_rule is not None:
                replacement = referenced_rule[0]
                regex_rule = referenced_rule[1]

            # Special case: a trailing THE is moved to the front of the name.
            # The match must be detected before the substitution removes it.
            move_the_to_front = (
                name_rule == "place_word_the_at_the_beginning"
                and re.search(regex_rule, clean_value) is not None
            )

            clean_value = re.sub(regex_rule, replacement, clean_value)

            if move_the_to_front:
                clean_value = "the " + clean_value
        return clean_value

    def _remove_unicode_chars(self, value: str) -> str:
        """Removes unicode character that is unreadable when converted to ASCII format.

        Arguments:
            value (str): any string containing unicode characters.

        Returns:
            (str): the corresponding input string without unicode characters.
        """
        # Characters that cannot be encoded as ASCII are silently dropped.
        return value.encode("ascii", "ignore").decode()

    def _apply_cleaning_rules(self, company_name: str) -> str:
        """Apply the cleaning rules from the dictionary of regex rules.

        Only the rules named in ``cleaning_rules_list`` are applied. Because
        the selection is keyed by rule name, a name listed more than once is
        applied a single time, at the position of its first occurrence.

        Arguments:
            company_name (str): the text to be cleaned.

        Returns:
            (str): the text after all selected rules were applied.

        Raises:
            KeyError: if a listed rule name is not in ``CLEANING_RULES_DICT``.
        """
        cleaning_dict = {
            rule_name: CLEANING_RULES_DICT[rule_name]
            for rule_name in self.cleaning_rules_list
        }
        return self._apply_regex_rules(company_name, cleaning_dict)

    def _apply_normalization_of_legal_terms(self, company_name: str) -> str:
        """Apply the normalization of legal terms according to dictionary of regex rules.

        Arguments:
            company_name (str): the (already lower-cased) text to normalize.

        Returns:
            (str): the text with the legal terms replaced by their normalized
            form, each surrounded by single spaces.
        """
        # Make sure to remove extra spaces, so legal terms can be found in the end (if requested)
        clean_company_name = company_name.strip()

        # The dictionary of legal terms defines how to normalize the text's
        # legal form abbreviations.
        # NOTE(review): the JSON file is read again on every call, i.e. once
        # per cleaned name; consider caching it if this shows up in profiles.
        json_source = files("pudl.package_data.settings").joinpath(
            self.__NAME_LEGAL_TERMS_DICT_FILE
        )
        with json_source.open() as json_file:
            _dict_legal_terms = json.load(json_file)[
                self.__NAME_JSON_ENTRY_LEGAL_TERMS
            ]["en"]

        # Loop invariant: whether terms must be anchored at the end of the name.
        only_at_the_end = self.legal_term_location == LegalTermLocation.AT_THE_END

        # Each normalized form has a list of possible terms to be searched for
        for normalized_form, legal_terms in _dict_legal_terms.items():
            replacement = " " + normalized_form.lower() + " "
            for legal_term in legal_terms:
                pattern = legal_term.lower()
                # If the legal term has . (dots), match the dots literally.
                # Otherwise, if it's a legal term with only letters in sequence,
                # make sure that regex finds the legal term as a whole word.
                if "." in pattern:
                    pattern = pattern.replace(".", r"\.")
                else:
                    pattern = r"\b" + pattern + r"\b"
                # Check if the legal term should be found only at the end of the string
                if only_at_the_end:
                    pattern += "$"
                clean_company_name = re.sub(pattern, replacement, clean_company_name)
        return clean_company_name

    def get_clean_data(self, company_name: str) -> str:
        """Clean a name and normalize legal terms.

        If ``company_name`` is null or not a string value, pd.NA
        will be returned. A warning is logged for any non-string value other
        than pd.NA itself.

        Arguments:
            company_name (str): the original text

        Returns:
            clean_company_name (str): the clean version of the text
        """
        if not isinstance(company_name, str):
            if company_name is not pd.NA:
                logger.warning("%s is not a string.", company_name)
            return pd.NA

        # Remove all unicode characters in the text's name, if requested
        if self.remove_unicode:
            clean_company_name = self._remove_unicode_chars(company_name)
        else:
            clean_company_name = company_name

        # Remove space in the beginning and in the end and convert it to lower case
        clean_company_name = clean_company_name.strip().lower()

        # Apply all the cleaning rules
        clean_company_name = self._apply_cleaning_rules(clean_company_name)

        # Apply normalization for legal terms
        if self.normalize_legal_terms:
            clean_company_name = self._apply_normalization_of_legal_terms(
                clean_company_name
            )

        # Apply the letter case, if different from 'lower'
        if self.output_lettercase == "upper":
            clean_company_name = clean_company_name.upper()
        elif self.output_lettercase == "title":
            clean_company_name = clean_company_name.title()

        # Remove excess of white space that might be introduced during previous cleaning
        clean_company_name = clean_company_name.strip()
        return re.sub(r"\s+", " ", clean_company_name)

    def apply_name_cleaning(
        self, df: pd.DataFrame, return_as_dframe: bool = False
    ) -> pd.DataFrame:
        """Clean up text names in a dataframe.

        Arguments:
            df (dataframe): the input dataframe (or series) that contains the
                text's name to be cleaned
            return_as_dframe (bool): whether to return the cleaned data as a dataframe or series.
                Useful to return as a dataframe if used in a cleaning pipeline with no
                vectorization step after name cleaning. If multiple columns are passed in for
                cleaning then output will be a dataframe regardless of this parameter.

        Returns:
            df (dataframe): the clean version of the input dataframe. For a
            single column (or a series) this is a series unless
            ``return_as_dframe`` is set.
        """
        if isinstance(df, pd.DataFrame):
            if len(df.columns) != 1:
                # Clean every column, then assemble the result in one concat
                # instead of growing a dataframe column by column.
                cleaned_columns = [
                    column.apply(self.get_clean_data) for _, column in df.items()
                ]
                if not cleaned_columns:
                    # Nothing to clean in a dataframe without columns.
                    return df.copy()
                return pd.concat(cleaned_columns, axis=1)
            # Select the only column explicitly: squeeze() would also collapse
            # a one-row frame down to a scalar, which has no ``apply``.
            names = df.iloc[:, 0]
        else:
            names = df

        out = names.apply(self.get_clean_data)
        if return_as_dframe:
            return out.to_frame()
        return out