# Source code for cltk.tokenizers.enm

""" Code for word tokenization: Middle English
"""

__author__ = ["Patrick J. Burns <patrick@diyclassics.org>"]
__license__ = "MIT License."

from cltk.tokenizers.word import RegexWordTokenizer

# (regex, replacement) pairs that ``MiddleEnglishWordTokenizer`` passes to
# ``RegexWordTokenizer`` as its ``patterns`` argument.
# NOTE(review): presumably each pair is applied in list order as a regex
# substitution before the text is split on whitespace -- confirm against
# ``cltk.tokenizers.word.RegexWordTokenizer``.
MiddleEnglishTokenizerPatterns = [
    # Surround every hyphen with spaces.
    (r"-", r" - "),
    # Replace each newline with a single space.
    (r"\n", r" "),
    # Zero-width match immediately before one of . " ; , : [ ] ( ) ! & ?
    # when some character precedes it: puts a space in front of the mark.
    (r"(?<=.)(?=[\.\";\,\:\[\]\(\)!&?])", r" "),
    # Mirror image of the pattern above: zero-width match immediately after
    # one of those marks when another character follows; puts a space after it.
    (r"(?<=[\.\";\,\:\[\]\(\)!&?])(?=.)", r" "),
    # Collapse each run of whitespace into a single space.
    (r"\s+", r" "),
]


class MiddleEnglishWordTokenizer(RegexWordTokenizer):
    """A regex-based tokenizer for Middle English.

    Thin subclass of ``RegexWordTokenizer`` that supplies the module-level
    ``MiddleEnglishTokenizerPatterns`` list as the tokenizer's patterns.
    All tokenization behavior comes from the base class.
    """

    def __init__(self):
        """Initialize the base tokenizer with the Middle English patterns."""
        super().__init__(patterns=MiddleEnglishTokenizerPatterns)