Source code for diggrtoolbox.linking.helpers

#!/usr/bin/env python3
"""
diggrlink helpers module contains helper functions used for dataset linking
"""


import roman
import re
import string
import os

# Module metadata.
__author__ = "Florian Rämisch and Peter Mühleder"
__copyright__ = "Copyright 2017, Universitätsbibliothek Leipzig"
# Legacy alias: the attribute was originally spelled without the trailing
# underscores; kept so that existing references keep working.
__copyright = __copyright__
__email__ = "team@diggr.link"


# CONSTANTS
# Translation table that deletes punctuation characters and blanks.
PUNCT_TRANSTABLE = str.maketrans("", "", ".,:-〔〕'’*/!&?+ ")
# Trademark / registered markers that get stripped from titles.
REMOVE_TM = ["™", "®", "(TM)", "(R)"]


# REGULAR EXPRESSIONS
# Upper-case roman numeral delimited by word boundaries. Each of the four
# alternatives forces a different digit group (thousands, hundreds, tens,
# units) to be non-empty, so the pattern never matches the empty string.
ROMAN_NUMERAL_REGEX = (
    r'\b('
    r'M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'
    r'|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'
    r'|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})'
    r'|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3})'
    r')\b'
)
# Arabic number, optionally with a single decimal part ("1.5" or "20").
NUMBERING_REGEX = r'(\d+\.\d+|\d+)'

ROMAN_NUMERAL_RE = re.compile(ROMAN_NUMERAL_REGEX)
NUMBERING_RE = re.compile(NUMBERING_REGEX)


def load_excluded_titles():
    """
    Load the list of excluded titles from the bundled resource file.

    Reads ``resources/exclude.txt`` located next to this module and returns
    one entry per line with surrounding whitespace (including the trailing
    newline) stripped. Blank lines are kept as empty strings.

    :return: list of str, in file order
    :raises OSError: if the resource file cannot be opened
    """
    file_name = os.path.join(os.path.dirname(__file__), "resources/exclude.txt")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (assumes the resource is UTF-8).
    with open(file_name, encoding="utf-8") as f:
        return [line.strip() for line in f]


def load_series():
    """
    Load the list of series names to remove from titles.

    Reads ``resources/series.txt`` located next to this module and returns
    one entry per line with surrounding whitespace (including the trailing
    newline) stripped. Blank lines are kept as empty strings.

    :return: list of str, in file order
    :raises OSError: if the resource file cannot be opened
    """
    file_name = os.path.join(os.path.dirname(__file__), "resources/series.txt")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (assumes the resource is UTF-8).
    with open(file_name, encoding="utf-8") as f:
        return [line.strip() for line in f]


def remove_tm(a):
    """
    Remove trademark symbols from string :a:.

    Every marker listed in the module-level ``REMOVE_TM`` is deleted wherever
    it occurs; the result is then stripped of leading/trailing whitespace.

    :param a: input string
    :return: :a: without trademark markers
    """
    for marker in REMOVE_TM:
        a = a.replace(marker, "")
    return a.strip()


def word_before_after(a, sep):
    """
    Return the word directly before and directly after :sep: in string :a:.

    The string is split on :sep:. The word before is the last
    blank-separated token of the text preceding the first occurrence; the
    word after is the first blank-separated token of the text between the
    first and the second occurrence (or the end of the string).

    :param a: string to search
    :param sep: separator; converted with ``str()``, so e.g. an int works
    :return: tuple ``(word_before, word_after)``; both are ``""`` when :a: is
        empty, :sep: is empty/None, or :sep: does not occur in :a:
    """
    if not a or sep is None:
        return "", ""
    # Normalise once so the membership test and the split agree on the type.
    sep = str(sep)
    # An empty separator cannot be split on (str.split would raise).
    if not sep or sep not in a:
        return "", ""
    parts = a.split(sep)
    word_before = parts[0].strip().split(" ")[-1]
    word_after = parts[1].strip().split(" ")[0]
    return word_before, word_after


def std(a):
    """
    Standardize string :a: for comparison.

    Removes the article "the", the characters in ``PUNCT_TRANSTABLE``
    (punctuation and blanks) and macrons, collapses a few romanization
    variants, and lower-cases the result.

    :param a: input string (may be empty or None)
    :return: standardized string; ``""`` for falsy input
    """
    if not a:
        return ""
    # Drop articles and the katakana middle dot. The blank inserted here is
    # removed again by the translate step below.
    # NOTE(review): this is a plain substring replace, so "the" inside a word
    # (e.g. "Other") is dropped as well.
    a = a.replace("The", " ").replace("・", " ").replace("THE", " ").replace("the", " ")
    # remove punctuation and blanks
    a = a.translate(PUNCT_TRANSTABLE)
    # remove macrons
    a = a.replace("ō", "o").replace("Ō", "O").replace("ū", "u").replace("Ū", "U")
    # collapse long-vowel spellings and the "nb"/"mb" romanization variant
    a = a.replace("ou", "o").replace("Ou", "O").replace("uu", "u").replace("Uu", "U").replace("nb", "mb")
    return a.lower()

def _get_position(a, n):
    """
    Classify where substring :n: first occurs in :a:.

    Returns a tuple ``(label, index)`` where ``index`` is the offset of the
    first occurrence and ``label`` is "start" (offset 0), "end" (the
    occurrence reaches the end of :a:) or "middle". Raises ValueError if
    :n: is not contained in :a:.
    """
    offset = a.index(n)
    if offset == 0:
        label = "start"
    elif len(a) - len(n) == offset:
        label = "end"
    else:
        label = "middle"
    return (label, offset)


def _extract_roman_numerals(a):
    """
    Extract all roman numerals in string :a:.

    Returns a list of dicts, one per match of ``ROMAN_NUMERAL_RE``, with keys
    "type" ("roman"), "value" (numeric value as float), "position"
    (tuple ``(label, index)`` with label "start", "end" or "middle") and
    "str" (the matched text).
    """
    numbers = []
    # Use the match offsets rather than searching for the matched text again:
    # the numeral's letters may also occur earlier inside another word
    # (e.g. the "V" of "VIDEO"), and the same numeral may appear twice.
    for match in ROMAN_NUMERAL_RE.finditer(a):
        numeral = match.group(1)
        start = match.start(1)
        if start == 0:
            label = "start"
        elif match.end(1) == len(a):
            label = "end"
        else:
            label = "middle"

        numbers.append({
            "type": "roman",
            "value": float(roman.fromRoman(numeral)),
            "position": (label, start),
            "str": numeral
        })
    return numbers

def _extract_numbers(a):
    """
    Return all arabic numbers in string :a: with their position and value.

    Each entry is a dict with keys "type", "value", "position"
    (tuple ``(label, index)`` with label "start", "end" or "middle") and
    "str" (the matched text).

    A four-digit number starting with 1 or 2, or a two-digit number starting
    with 8, 9 or 0, is treated as a year: type is "year" and the value is the
    last two digits as int. Everything else has type "number" and a float
    value.
    """
    numbers = []
    # Use the match offsets rather than searching for the matched text again:
    # a number may also occur earlier as part of a longer one ("1" in "10")
    # or appear several times in the same string.
    for match in NUMBERING_RE.finditer(a):
        n = match.group(1)
        start = match.start(1)
        if start == 0:
            label = "start"
        elif match.end(1) == len(a):
            label = "end"
        else:
            label = "middle"

        # check if year
        if len(n) == 4 and n[0] in "12" and "." not in n:
            ntype = "year"
            value = int(n[2:])
        elif len(n) == 2 and n[0] in "890" and "." not in n:
            ntype = "year"
            value = int(n)
        # else convert value to float
        else:
            ntype = "number"
            value = float(n)

        numbers.append({
            "type": ntype,
            "value": value,
            "position": (label, start),
            "str": n
        })

    return numbers

def extract_all_numbers(a):
    """
    Return all numbers (roman and arabic) in string :a:.

    The entries produced by ``_extract_numbers`` and
    ``_extract_roman_numerals`` are combined and ordered by their position
    index, highest index first.
    """
    numbers = _extract_numbers(a) + _extract_roman_numerals(a)
    # reverse=True keeps the sort stable, so entries with equal index stay in
    # their original relative order (arabic before roman).
    return sorted(numbers, key=lambda entry: entry["position"][1], reverse=True)


def remove_numbers(a):
    """
    Remove all numbers (arabic and roman) from string :a:.

    After deleting the matches, runs of blanks are collapsed to a single
    blank, a blank left in front of a colon is dropped, and the result is
    stripped.
    """
    without_arabic = NUMBERING_RE.sub("", a)
    without_numbers = ROMAN_NUMERAL_RE.sub("", without_arabic)
    collapsed = re.sub(" +", " ", without_numbers)
    return collapsed.replace(" :", ":").strip()
Source code for diggrtoolbox.linking.helpers

Navigation

Related Topics