Source code for cltk.tokenizers.fro

""" Code for word tokenization: Old French
"""

__author__ = [
    "Natasha Voake <natashavoake@gmail.com>",
    "Patrick J. Burns <patrick@diyclassics.org>",
]
__license__ = "MIT License."

from cltk.tokenizers.word import RegexWordTokenizer

# Ordered (regex pattern, replacement) pairs passed to RegexWordTokenizer by
# OldFrenchWordTokenizer.
# NOTE(review): presumably the rules are applied in sequence, each one to the
# output of the previous (the second rule only sees curly apostrophes after
# the first has normalized them) -- confirm against RegexWordTokenizer.
OldFrenchTokenizerPatterns = [
    # Normalize the typographic apostrophe (U+2019) to the ASCII apostrophe.
    (r"’", r"'"),
    # Put a space after every ASCII apostrophe so that an elided form is
    # separated from the following word ("S'a" -> "S' a").
    (r"\'", r"' "),
    # Zero-width rule: insert a space at every position that has a preceding
    # character and is immediately followed by one of  . ! ? ) ( " ; : , « » -
    # Only the gap *before* the punctuation mark is opened up.
    (r"(?<=.)(?=[.!?)(\";:,«»\-])", " "),
]


class OldFrenchWordTokenizer(RegexWordTokenizer):
    """A regex-based word tokenizer for Old French.

    This is a ``RegexWordTokenizer`` configured with the module-level
    ``OldFrenchTokenizerPatterns`` list of (pattern, replacement) pairs;
    it adds no behavior of its own beyond that configuration.
    """

    def __init__(self):
        """Initialize the base tokenizer with the Old French patterns."""
        super().__init__(patterns=OldFrenchTokenizerPatterns)