# Source code for cltk.tokenizers.gmh

""" Code for word tokenization: Middle High German
"""

__author__ = ["Patrick J. Burns <patrick@diyclassics.org>"]
__license__ = "MIT License."

from cltk.tokenizers.word import RegexWordTokenizer

# As far as I know, hyphens were never used for compounds in Middle High
# German, so the tokenizer treats a hyphen at the end of a line as a
# line-break marker rather than as part of a compound.
#
# Each entry is a (regex pattern, replacement string) pair.
# NOTE(review): presumably RegexWordTokenizer applies these substitutions in
# list order before splitting the text into tokens -- confirm against
# cltk.tokenizers.word.
MiddleHighGermanTokenizerPatterns = [
    # A hyphen immediately followed by a newline: the newline is dropped and
    # the hyphen is kept, so the two line halves end up adjacent.
    # NOTE(review): the hyphen itself is retained in the output; if the intent
    # is to fully rejoin the broken word, the replacement would need to be
    # empty -- confirm the desired behaviour.
    (r"-\n", r"-"),
    # Every remaining newline becomes a single space.
    (r"\n", r" "),
    # Zero-width position that is preceded by any character and followed by
    # one of . " ; , : [ ] ( ) ! & ? -- a space is inserted there, i.e.
    # directly before the punctuation mark.
    (r"(?<=.)(?=[\.\";\,\:\[\]\(\)!&?])", r" "),
    # Mirror image of the rule above: a space is inserted directly after one
    # of the same punctuation marks whenever another character follows it.
    (r"(?<=[\.\";\,\:\[\]\(\)!&?])(?=.)", r" "),
    # Collapse each run of whitespace (including the spaces added above) into
    # one space.
    (r"\s+", r" "),
]


class MiddleHighGermanWordTokenizer(RegexWordTokenizer):
    """A regex-based tokenizer for Middle High German.

    This subclass adds no behaviour of its own: it only configures
    ``RegexWordTokenizer`` with the module-level
    ``MiddleHighGermanTokenizerPatterns`` substitution rules.
    """

    def __init__(self):
        """Initialise the base tokenizer with the Middle High German patterns."""
        super().__init__(patterns=MiddleHighGermanTokenizerPatterns)