Source code for diggrtoolbox.linking.rules

#!/usr/bin/env python3
"""
module contains general matching rules
"""


# Module metadata.
__author__ = "Florian Rämisch and Peter Mühleder"
# Spelled as a full dunder so it matches ``__author__`` / ``__email__``.
__copyright__ = "Copyright 2017, Universitätsbibliothek Leipzig"
__email__ = "team@diggr.link"


import re
import Levenshtein as lev
from .helpers import std, extract_all_numbers
from .config import *


# REGULAR EXPRESSIONS
# Matches either "<digits><any one character><digits>" or a plain run of digits.
# NOTE(review): the "." between the digit groups is unescaped, so it accepts
# any separator character (not only a literal dot) -- confirm whether "\."
# was intended. The constant is not referenced in the visible part of this
# module.
NUMBERING_REGEX = r'(\d+.\d+|\d+)'


def numbering_rule(a, b):
    """
    Compare the numbering found in two strings and return a penalty on mismatch.

    Only the first entry returned by ``extract_all_numbers`` for each string
    is used; from it the rule reads the ``"value"``, ``"str"`` and
    ``"position"`` fields. How numbers are detected and what the position
    labels mean is decided by that helper.

    :param a: first string
    :param b: second string
    :returns: ``0`` when the two number values are equal (this includes the
        case where neither string yields a number), or when only one string
        yields a number, its position label is ``"middle"`` and removing the
        number's text leaves strings whose standardised forms have a
        ``lev.ratio`` of 1; ``NUMBERING_WEIGHT`` otherwise.
    """
    found_a = extract_all_numbers(a)
    found_b = extract_all_numbers(b)

    # Placeholders used when a string yields no number at all.
    num_a, text_a, place_a = "nan", "", ""
    num_b, text_b, place_b = "nan", "", ""

    if found_a != []:
        lead = found_a[0]
        num_a, text_a, place_a = lead["value"], lead["str"], lead["position"][0]
    if found_b != []:
        lead = found_b[0]
        num_b, text_b, place_b = lead["value"], lead["str"], lead["position"][0]

    # A number in the "middle" of one string is forgiven when the other
    # string has no number and both are otherwise identical.
    candidates = (
        (place_a, num_b, a, text_a, b),
        (place_b, num_a, b, text_b, a),
    )
    for place, other_num, own, own_text, other in candidates:
        if place == "middle" and other_num == "nan":
            stripped = own.replace(own_text, "")
            if lev.ratio(std(stripped), std(other)) == 1:
                return 0

    return 0 if num_a == num_b else NUMBERING_WEIGHT
def first_letter_rule(a, b):
    """
    Penalise single-word strings whose first letters differ.

    The rule only applies when both :a: and :b: are non-empty and neither
    contains a space character; the first characters are compared
    case-insensitively.

    :returns: ``FIRST_LETTER_WEIGHT`` if the rule applies and the first
        letters differ, ``0`` in every other case.
    """
    # Nothing to compare if either value is empty / falsy.
    if not (a and b):
        return 0

    # The rule is restricted to strings consisting of exactly one word.
    if len(a.split(" ")) != 1 or len(b.split(" ")) != 1:
        return 0

    initial_a = a[0].lower()
    initial_b = b[0].lower()
    return FIRST_LETTER_WEIGHT if initial_a != initial_b else 0