# Source code for cltk.tokenizers.lat.utils
""" Tokenization utilities: Latin
"""
__author__ = ["Patrick J. Burns <patrick@diyclassics.org>"]
__license__ = "MIT License."
import pickle
from typing import Any, Generator, Set
from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer, PunktTrainer
from cltk.tokenizers.lat.params import ABBREVIATIONS
from cltk.tokenizers.utils import SentenceTokenizerTrainer
class LatinSentenceTokenizerTrainer(SentenceTokenizerTrainer):
    """Sentence-tokenizer trainer preconfigured for Latin.

    Thin subclass of ``SentenceTokenizerTrainer`` that fixes the language
    code to ``"lat"`` and supplies Latin-appropriate punctuation lists and
    the ``ABBREVIATIONS`` collection imported from
    ``cltk.tokenizers.lat.params``.

    Attributes set on the instance before delegating to the parent:
        strict: The ``strict`` flag exactly as passed by the caller.
        punctuation: ``[".", "?", "!"]``.
        strict_punctuation: ``[";", ":", "—"]``.
        abbreviations: The module-level ``ABBREVIATIONS`` object (shared,
            not copied).
    """

    def __init__(self, strict: bool = False) -> None:
        """Configure the trainer for Latin and initialise the parent class.

        Args:
            strict: Forwarded unchanged to ``SentenceTokenizerTrainer``.
                NOTE(review): presumably enables ``strict_punctuation`` as
                additional sentence boundaries -- confirm against the
                parent class, whose implementation is not visible here.
        """
        self.strict = strict
        self.punctuation = [".", "?", "!"]
        # The last entry is an em dash (U+2014), not an ASCII hyphen.
        self.strict_punctuation = [";", ":", "—"]
        self.abbreviations = ABBREVIATIONS
        super().__init__(
            language="lat",
            punctuation=self.punctuation,
            strict=self.strict,
            strict_punctuation=self.strict_punctuation,
            abbreviations=self.abbreviations,
        )