Source code for diggrtoolbox.standardize.standardize

import string
import re
import os

# Translation table for ``str.translate``: every character listed in
# ``string.punctuation`` is mapped to ``None``, i.e. deleted from the string.
PUNCT_TRANSTABLE = str.maketrans(dict.fromkeys(string.punctuation))

def remove_html(s):
    """
    Strip html tags from string :s: and collapse its whitespace.

    Falsy input (``None`` or an empty string) is handed back untouched.
    Otherwise every ``<...>`` tag is deleted and each run of whitespace is
    reduced to a single space, with none left at either end.
    """
    if not s:
        return s
    without_tags = re.sub(r'<[^<]+?>', '', s)
    # split() without arguments discards leading/trailing whitespace and
    # splits on whitespace runs, so the joined result needs no extra strip().
    return " ".join(without_tags.split())
def remove_punctuation(s):
    """
    Delete all punctuation characters from string :s: .

    The characters to drop are defined by the module-level
    ``PUNCT_TRANSTABLE``. Falsy input (``None`` or an empty string) is
    returned unchanged.
    """
    return s.translate(PUNCT_TRANSTABLE) if s else s
def remove_bracketed_text(s):
    """
    Removes text in round () and square [] brackets from string :s: .

    Only innermost bracket pairs are removed, in a single pass per bracket
    type, so the outer pair of a nested group is left in place. Whitespace
    next to a removed group is not collapsed; only the two ends of the
    result are stripped.

    :s: string to clean; falsy values (``None``, ``""``) are returned as is
    :returns: :s: without bracketed text and without surrounding whitespace
    """
    if not s:
        # Same convention as the other helpers in this module: nothing to do
        # for None / empty input (re.sub would raise TypeError on None).
        return s
    s = re.sub(r'\([^()]+?\)', '', s)
    # Both bracket characters have to be inside the negated class. The
    # earlier pattern r'\[[^\[]]+?\]' closed the class right after '\[' and
    # therefore only matched text such as "[x]]", never a plain "[text]".
    s = re.sub(r'\[[^\[\]]+?\]', '', s)
    return s.strip()
def std_url(url):
    """
    Standardizes urls by removing protocol and final slash.

    Only a leading ``scheme://`` prefix (or a protocol-relative ``//``) is
    removed. Any later ``//`` - a doubled slash in the path or a complete
    url embedded in the query string - is kept, so the host is never lost.
    At most one trailing slash is removed.

    :url: url string; falsy values (``None``, ``""``) are returned as is
    :returns: the url without protocol prefix and without final slash
    """
    if not url:
        return url
    # Anchored at the start on purpose: splitting on every "//" and keeping
    # the last piece would turn "http://host/a//b" into just "b".
    url = re.sub(r'^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//', '', url)
    if url.endswith("/"):
        url = url[:-1]
    return url
def std(s, lower=True, rm_punct=True, rm_bracket=True, rm_spaces=False,
        rm_strings=None):
    """
    Combined string standardization function.

    The enabled steps run in this order: lower-casing, removal of bracketed
    text, removal of punctuation, removal of the given substrings, removal
    of spaces, and a final strip of surrounding whitespace.

    :s: string to standardize; falsy values (``None``, ``""``) are returned
        as is
    :lower: lower case
    :rm_punct: remove punctuation
    :rm_bracket: remove text enclosed in brackets () []
    :rm_spaces: remove white spaces (space characters only)
    :rm_strings: list of substrings to be removed from string before
        comparison; every entry is lower-cased before it is matched
    :returns: the standardized string
    """
    if not s:
        return s
    if lower:
        s = s.lower()
    if rm_bracket:
        # Has to run before punctuation removal: the bracket characters are
        # punctuation themselves, so once they are stripped there is nothing
        # left to tell bracketed text apart and this step would do nothing.
        s = remove_bracketed_text(s)
    if rm_punct:
        s = remove_punctuation(s)
    if rm_strings:
        for form in rm_strings:
            s = s.replace(form.lower(), "")
    if rm_spaces:
        s = s.replace(" ", "")
    return s.strip()