Source code for diggrtoolbox.linking.link

#!/usr/bin/env python3
"""
link module for linking datasets
"""

from itertools import product
import random
import Levenshtein as lev
from .rules import *
from .helpers import remove_tm, std, load_series, remove_numbers
from .config import *

__author__ = "Florian Rämisch and Peter Mühleder"
# NOTE(review): unlike the neighbouring __author__/__email__, this name has no
# trailing double underscore -- presumably meant to be __copyright__; confirm
# before renaming, since it changes a module attribute.
__copyright = "Copyright 2017, Universitätsbibliothek Leipzig"
__email__ = "team@diggr.link"


# MATCHING RULES & FILTERS
# Default rule set for match_titles(). Each rule is called as rule(a, b) on a
# pair of pre-processed titles; the summed results are subtracted from the
# Levenshtein similarity ratio of that pair.
ALL_RULES = [first_letter_rule, numbering_rule]

# PREPROCESSING
# Series names, loaded once at import time. _pre_processing() removes
# "<name> Series" from a title for the first entry it finds in that title.
REMOVE_SERIES = load_series()

def _pre_processing(a):
    """
    inital steps of preparing string :a: for matching
    """
    if a:
        a = remove_tm(a)
        a = a.replace("Ⅱ", "II")
        a = a.split("(")[0]
        a = a.replace("〔", "").replace("〕","")

        for series in REMOVE_SERIES:
            if series in a:
                a = a.replace(series+" Series","")
                break
    return a.strip()

def match_titles(titles_a, titles_b, rules=ALL_RULES):
    """
    Returns the best match value for two lists of titles.

    Every title of :titles_a: is compared with every title of :titles_b:.
    For each pair the match value is the Levenshtein ratio of the two
    titles (pre-processed, numbers removed, standardised via ``std``) minus
    the sum of the values returned by all :rules: for that pair. The highest
    value found is returned.

    :titles_a: List of title strings
    :titles_b: List of title strings
    :rules:    List of matching rules; each is called as ``rule(a, b)`` with
               the pre-processed titles and its result is subtracted from
               the ratio

    Returns 0 if there is no pair of non-empty titles or if no pair scores
    above 0.
    """
    titles_a = list(titles_a)
    titles_b = list(titles_b)
    # Without at least one title on each side there are no pairs to compare.
    if not titles_a or not titles_b:
        return 0

    # Pre-process every title once instead of once per pair, and drop titles
    # that are empty after pre-processing (they can never form a valid pair).
    cleaned_a = [t for t in map(_pre_processing, titles_a) if t]
    cleaned_b = [t for t in map(_pre_processing, titles_b) if t]
    if not cleaned_a or not cleaned_b:
        return 0

    # The number-free, standardised form used for the Levenshtein ratio only
    # depends on the single title, so compute it once per title as well.
    prepared_a = [(t, std(remove_numbers(t))) for t in cleaned_a]
    prepared_b = [(t, std(remove_numbers(t))) for t in cleaned_b]

    best_ratio = 0
    for (a, a_norm), (b, b_norm) in product(prepared_a, prepared_b):
        # Rules see the pre-processed titles (numbers still included).
        weights = [rule(a, b) for rule in rules]
        r = lev.ratio(a_norm, b_norm) - sum(weights)
        if r > best_ratio:
            best_ratio = r

    return best_ratio
Source code for diggrtoolbox.linking.link

Navigation

Related Topics