# Source code for cltk.tokenizers.non

""" Code for word tokenization: Old Norse
"""

__author__ = [
    "Clément Besnier <clemsciences@gmail.com>",
    "Patrick J. Burns <patrick@diyclassics.org>",
]
__license__ = "MIT License."

from cltk.tokenizers.word import RegexWordTokenizer

# Ordered (pattern, replacement) pairs handed to RegexWordTokenizer as its
# ``patterns`` argument.
# Author's note: as far as is known, hyphens were never used to build compounds
# in Old Norse, so every hyphen is treated as a line-break marker rather than
# as part of a word.
OldNorseTokenizerPatterns = [
    # Follow every apostrophe with a space.
    (r"\'", r"' "),
    # Insert a space in front of each punctuation mark, quote, guillemet or
    # hyphen, provided some character (anything ``.`` matches) precedes it.
    (r"(?<=.)(?=[.!?)(\";:,«»\-])", " "),
]


class OldNorseWordTokenizer(RegexWordTokenizer):
    """A regex-based tokenizer for Old Norse.

    Specialises ``RegexWordTokenizer`` by supplying the module-level
    ``OldNorseTokenizerPatterns`` list as its substitution patterns.
    """

    def __init__(self):
        """Initialise the base tokenizer with the Old Norse patterns."""
        super().__init__(patterns=OldNorseTokenizerPatterns)