# Source code for cltk.sentence.processes
"""Module for sentence tokenizers.
"""
from copy import copy
from dataclasses import dataclass
from boltons.cacheutils import cachedproperty
from cltk.core import CLTKException
from cltk.core.data_types import Doc, Process
from cltk.sentence.non import OldNorseRegexSentenceTokenizer
from cltk.sentence.sentence import SentenceTokenizer
__author__ = ["Clément Besnier <clem@clementbesnier.fr>"]
@dataclass
class SentenceTokenizationProcess(Process):
    """Base class for language-specific sentence tokenization processes.

    Subclass once per language and override ``algorithm``.
    Example: ``SentenceTokenizationProcess`` -> ``OldNorseSentenceTokenizationProcess``

    >>> from cltk.tokenizers.processes import TokenizationProcess
    >>> from cltk.core.data_types import Process
    >>> issubclass(SentenceTokenizationProcess, Process)
    True
    >>> tok = SentenceTokenizationProcess()
    """

    # Optional model handed to the tokenizer's ``tokenize`` call.
    # Deliberately a plain class attribute (no annotation) so the
    # dataclass machinery does not treat it as an init field.
    model = None

    @cachedproperty
    def algorithm(self):
        """Tokenizer instance; must be supplied by a language subclass."""
        raise CLTKException(
            f"No sentence tokenization algorithm for language '{self.language}'."
        )

    def run(self, input_doc: Doc) -> Doc:
        """Split ``input_doc.raw`` into sentences and tag every word with
        the index of the sentence it falls in.

        Returns a shallow copy of the input ``Doc`` with each word's
        ``index_sentence`` attribute set.
        """
        output_doc = copy(input_doc)
        tokenizer = self.algorithm
        if not isinstance(tokenizer, SentenceTokenizer):
            raise CLTKException(
                "Algorithm must be an instance of SentenceTokenizer subclass"
            )
        sentences = tokenizer.tokenize(output_doc.raw, self.model)

        # Cumulative character lengths: entry i approximates where
        # sentence i ends in the raw text.
        boundaries = []
        running_total = 0
        for sentence in sentences:
            running_total += len(sentence)
            boundaries.append(running_total)

        # Walk words in order, advancing to the next sentence once a
        # word's stop offset passes the current boundary.
        current = 0
        for word in output_doc.words:
            if boundaries[current] < word.index_char_stop and current + 1 < len(
                boundaries
            ):
                current += 1
            word.index_sentence = current
        return output_doc
@dataclass
class OldNorseSentenceTokenizationProcess(SentenceTokenizationProcess):
    """Default sentence-tokenization process for Old Norse, backed by a
    regex-based tokenizer.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.sentence.processes import OldNorseSentenceTokenizationProcess
    >>> from cltk.tokenizers import OldNorseTokenizationProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old Norse pipeline", \
processes=[OldNorseTokenizationProcess, OldNorseSentenceTokenizationProcess], \
language=get_lang("non"))
    >>> nlp = NLP(language='non', custom_pipeline=pipe, suppress_banner=True)
    >>> output_doc = nlp.analyze(get_example_text("non"))
    >>> len(output_doc.sentences_strings)
    7
    """

    @cachedproperty
    def algorithm(self):
        """Build and cache the Old Norse regex sentence tokenizer."""
        tokenizer = OldNorseRegexSentenceTokenizer()
        return tokenizer