Source code for cltk.alphabet.fro
"""The normalizer aims to maximally reduce the variation between the orthography of texts written in the Anglo-Norman dialect
to bring it in line with “orthographe commune”. It is heavily inspired by Pope (1956).
Spelling variation is not consistent enough to ensure the highest accuracy; the normalizer in its current format should
therefore be used as a last resort.
The normalizer, word tokenizer, stemmer, lemmatizer, and list of stopwords for OF/MF were developed as part of Google Summer of Code 2017.
A full write-up of this work can be found at : https://gist.github.com/nat1881/6f134617805e2efbe5d275770e26d350
**References :** Pope, M.K. 1956. From Latin to Modern French with Especial Consideration of Anglo-Norman. Manchester: MUP.
Anglo-French spelling variants normalized to "orthographe commune", from M. K. Pope (1956)
- word-final d - e.g. vertud vs vertu
- use of <u> over <ou>
- <eaus> for <eus>, <ceaus> for <ceus>
- triphtongs:
- <iu> for <ieu>
- <u> for <eu>
- <ie> for <iee>
- <ue> for <uee>
- <ure> for <eure>
- "epenthetic vowels" - e.g. averai for avrai
- <eo> for <o>
- <iw>, <ew> for <ieux>
- final <a> for <e>
"""
import re
FRO_PATTERNS = [
("eaus$", "eus"),
("ceaus$", "ceus"),
("iu", "ieu"),
("((?<!^)|(?<!(e)))u(?!$)", "eu"),
("ie$", "iee"),
("ue$", "uee"),
("ure$", "eure"),
("eo$", "o"),
("iw$", "ieux"),
("ew$", "ieux"),
("a$", "e"),
("^en", "an"),
("d$", ""),
]
[docs]def build_match_and_apply_functions(pattern, replace):
"""Assemble regex patterns."""
def matches_rule(word):
return re.search(pattern, word)
def apply_rule(word):
return re.sub(pattern, replace, word)
return matches_rule, apply_rule
[docs]def normalize_fr(tokens: list[str]) -> list[str]:
"""Normalize Old and Middle French tokens.
TODO: Make work work again with a tokenizer.
"""
# from cltk.tokenizers.word import WordTokenizer
# string = string.lower()
# word_tokenizer = WordTokenizer("fro")
# tokens = word_tokenizer.tokenize(string)
rules = [
build_match_and_apply_functions(pattern, replace)
for (pattern, replace) in FRO_PATTERNS
]
normalized_text = []
for token in tokens:
for matches_rule, apply_rule in rules:
if matches_rule(token):
normalized = apply_rule(token)
normalized_text.append(normalized)
return normalized_text