# Source code for diggrtoolbox.linking.link

#!/usr/bin/env python3
"""
link module for linking datasets
"""

from itertools import product
import random
import Levenshtein as lev
from .rules import *
from .helpers import remove_tm, std, load_series, remove_numbers
from .config import *

# Module metadata, using the conventional dunder spelling for every attribute.
__author__ = "Florian Rämisch and Peter Mühleder"
__copyright__ = "Copyright 2017, Universitätsbibliothek Leipzig"
__email__ = "team@diggr.link"


# MATCHING RULES & FILTERS
# Default rule set used by match_titles(). Each rule is called as rule(a, b)
# on a pair of pre-processed titles; the rule results are summed and
# subtracted from the Levenshtein ratio. The two rule callables are
# presumably supplied by the star import from .rules -- verify there.
ALL_RULES = [first_letter_rule, numbering_rule]

# PREPROCESSING
# Loaded once, at import time. _pre_processing() iterates over these entries,
# stops at the first one contained in the title and removes the text
# "<entry> Series" from that title.
REMOVE_SERIES = load_series()

def _pre_processing(a):
    """
    Initial steps of preparing string :a: for matching.

    The title is passed through ``remove_tm``, the single-character roman
    numeral "Ⅱ" is replaced by "II", everything from the first "(" onwards
    is dropped, the brackets "〔" / "〕" are removed and, for the first entry
    of ``REMOVE_SERIES`` contained in the title, the text "<entry> Series"
    is removed.

    :a: Title string; may be ``None`` or empty.
    :returns: The cleaned title without surrounding whitespace, or ``""``
              when :a: is falsy.
    """
    if not a:
        # Nothing to clean. Returning "" (instead of falling through to
        # ``a.strip()``) keeps ``None`` from raising AttributeError; callers
        # only test the result for truthiness.
        return ""

    a = remove_tm(a)
    a = a.replace("Ⅱ", "II")
    # Keep only the part in front of the first opening parenthesis.
    a = a.split("(")[0]
    a = a.replace("〔", "").replace("〕", "")

    for series in REMOVE_SERIES:
        if series in a:
            a = a.replace(series + " Series", "")
            # Only the first series name found in the title is considered.
            break
    return a.strip()

def _prepare_titles(titles):
    """
    Pre-process every title once and pair it with its comparison form.

    :titles: Iterable of title strings; falsy entries are skipped.
    :returns: List of ``(cleaned, comparable)`` tuples, where ``cleaned`` is
              the result of ``_pre_processing`` and ``comparable`` is
              ``std(remove_numbers(cleaned))``. Titles that are empty after
              pre-processing are left out.
    """
    prepared = []
    for title in titles:
        if not title:
            # Skip None / "" up front instead of handing them to
            # _pre_processing.
            continue
        cleaned = _pre_processing(title)
        if cleaned:
            prepared.append((cleaned, std(remove_numbers(cleaned))))
    return prepared


def match_titles(titles_a, titles_b, rules=ALL_RULES):
    """
    Returns match value for two lists of titles.

    Every title of :titles_a: is compared with every title of :titles_b:.
    The score of a pair is the Levenshtein ratio of the two titles (numbers
    removed, then passed through ``std``) minus the sum of all rule results
    for the pair. The best score over all pairs is returned.

    :titles_a: List of title strings
    :titles_b: List of title strings
    :rules: List of matching rules; each rule is called as ``rule(a, b)``
            with the pre-processed titles and its result is subtracted from
            the ratio
    :returns: Highest score found, or 0 when no pair scores above 0 (this
              includes the case of no usable titles on either side)
    """
    # Clean and normalise each title once instead of once per pair.
    prepared_a = _prepare_titles(titles_a)
    prepared_b = _prepare_titles(titles_b)

    best_ratio = 0
    for (a, a_cmp), (b, b_cmp) in product(prepared_a, prepared_b):
        penalty = sum(rule(a, b) for rule in rules)
        ratio = lev.ratio(a_cmp, b_cmp) - penalty
        if ratio > best_ratio:
            best_ratio = ratio
    return best_ratio