# Source code for cltk.alphabet.gmh

"""The alphabet for Middle High German. Source:

- *Schreibkonventionen des klassischen Mittelhochdeutschen*, Simone Berchtold
- https://de.wikipedia.org/wiki/Mittelhochdeutsch

The consonants of Middle High German are categorized as:

- Stops: ⟨p t k/c/q b d g⟩
- Affricates: ⟨pf/ph tz/z⟩
- Fricatives: ⟨v f s ȥ sch ch h⟩
- Nasals: ⟨m n⟩
- Liquids: ⟨l r⟩
- Semivowels: ⟨w j⟩

Misc. notes:

- Word-initial c occurs only in loanwords and is pronounced the same as k (e.g. calant, cappitain)
- Double consonants are pronounced the same way as their corresponding letters in Modern Standard German (e.g. pp/p)
- schl, schm, schn, schw are written in MHG as sl, sm, sn, sw
- æ (also seen as ae), œ (also seen as oe) and iu denote the use of Umlaut over â, ô and û respectively
- ȥ or ʒ is used in modern handbooks and grammars to indicate the s or s-like sound which arose from Germanic t in the High German consonant shift.

>>> from cltk.alphabet import gmh
>>> gmh.CONSONANTS[:5]
['b', 'd', 'g', 'h', 'f']
>>> gmh.VOWELS[:5]
['a', 'ë', 'e', 'i', 'o']
"""

import re
import unicodedata

# Every grapheme of the alphabet: short vowels, then long vowels, then the
# consonant letters, with the handbook symbol "ȥ" last.
ALPHABET = (
    # short vowels
    ["a", "ë", "e", "i", "o", "u", "ä", "ö", "ü"]
    # long vowels ("iu" is a two-letter spelling of one long vowel)
    + ["â", "ê", "î", "ô", "û", "æ", "œ", "iu"]
    # consonant letters
    + ["b", "d", "g", "h", "f", "c", "j", "k", "l", "m"]
    + ["n", "p", "q", "r", "s", "t", "v", "w", "z"]
    # s-like sound marked in modern handbooks and grammars
    + ["ȥ"]
)


# Single-letter consonants. Note that "ȥ", although present in ALPHABET,
# is not listed here.
CONSONANTS = [
    "b", "d", "g", "h", "f",
    "c", "j", "k", "l", "m",
    "n", "p", "q", "r", "s",
    "t", "v", "w", "z",
]

# All vowel graphemes: the short vowels followed by the long vowels.
VOWELS = (
    # short
    ["a", "ë", "e", "i", "o", "u", "ä", "ö", "ü"]
    # long
    + ["â", "ê", "î", "ô", "û", "æ", "œ", "iu"]
)

# Vowel graphemes split by length.
SHORT_VOWELS = [
    "a", "ë", "e", "i", "o", "u",
    "ä", "ö", "ü",
]

LONG_VOWELS = [
    "â", "ê", "î", "ô", "û",
    "æ", "œ", "iu",
]

# Two-letter sequences.
# NOTE(review): "ch", "ng" and "nt" are consonant clusters rather than vowel
# diphthongs; presumably they are listed so consumers treat them as single
# units -- confirm against callers before changing.
DIPHTHONGS = [
    "ei", "ie", "ou", "öu", "uo", "üe",
    "ch", "ng", "nt",
]

# Three-letter sequences.
# NOTE(review): "sch" is a consonant cluster, not a vowel triphthong.
TRIPHTHONGS = [
    "sch",
]


def normalize_middle_high_german(
    text: str,
    to_lower_all: bool = True,
    to_lower_beginning: bool = False,
    alpha_conv: bool = True,
    punct: bool = True,
    ascii: bool = False,
) -> str:
    """Normalize a Middle High German string.

    The enabled steps are applied in this order: lowercasing, alphabet
    conversion, punctuation removal, ASCII folding.

    >>> from cltk.alphabet import gmh
    >>> from cltk.languages.example_texts import get_example_text
    >>> gmh.normalize_middle_high_german(get_example_text("gmh"))[:50]
    'uns ist in alten\\nmæren wunders vil geseit\\nvon hele'

    :param text: text to normalize; an empty string is returned unchanged
    :param to_lower_all: convert whole text to lowercase
    :param to_lower_beginning: lowercase only the first character of the
        text and the first word character following ``.``, ``?`` or ``!``
        plus one whitespace character
    :param alpha_conv: convert alphabet to canonical form: macron vowels
        (ē ī ā ō ū) become circumflex vowels (ê î â ô û), and the digraphs
        ``ae``/``oe`` become the ligatures ``æ``/``œ``
    :param punct: remove punctuation (``. " ; , : [ ] ( ) ! & ? ‘``)
    :param ascii: returns ascii form; accents are stripped, and characters
        without an ASCII decomposition (such as ``æ``, ``œ`` and ``ȥ``) are
        dropped
    :return: normalized text
    """

    if to_lower_all:
        text = text.lower()
    if to_lower_beginning:
        # Slice rather than index so that an empty string does not raise.
        text = text[:1].lower() + text[1:]
        # Lowercase the first word character after sentence-final punctuation.
        text = re.sub(r"(?<=[\.\?\!]\s)(\w)", lambda m: m.group(1).lower(), text)
    if alpha_conv:
        # Only lowercase forms are mapped, so with to_lower_all=False any
        # uppercase macron vowels or "Ae"/"Oe" are left untouched.
        text = (
            text.replace("ē", "ê")
            .replace("ī", "î")
            .replace("ā", "â")
            .replace("ō", "ô")
            .replace("ū", "û")
        )
        text = text.replace("ae", "æ").replace("oe", "œ")
    if punct:
        text = re.sub(r"[\.\";\,\:\[\]\(\)!&?‘]", "", text)
    if ascii:
        # NFKD splits accented letters into base letter + combining mark;
        # encoding with "ignore" then discards every non-ASCII code point.
        text_raw: bytes = unicodedata.normalize("NFKD", text).encode(
            "ASCII", "ignore"
        )
        text = text_raw.decode("utf-8")  # Convert back to string
    return text
# Source code for cltk.alphabet.gmh
#
# The Classical Language Toolkit
#
# Navigation
#
# Related Topics