# Source code for cltk.sentence.sentence
"""Tokenize sentences."""
__author__ = [
"Patrick J. Burns <patrick@diyclassics.org>",
"Kyle P. Johnson <kyle@kyle-p-johnson.com>",
"Anoop Kunchukuttan <anoop.kunchukuttan@gmail.com>",
]
__license__ = "MIT License. See LICENSE."
import os
import re
from abc import ABC, abstractmethod
from cltk.core import CLTKException
from cltk.utils import CLTK_DATA_DIR
from cltk.utils.file_operations import open_pickle
# from nltk.tokenize.punkt import PunktSentenceTokenizer as NLTKPunktSentenceTokenizer
class SentenceTokenizer(ABC):
    """Base class for sentence tokenization.

    Concrete tokenizers hold a ``model`` object exposing a ``tokenize``
    method; :meth:`tokenize` delegates the actual splitting to it.
    """

    @abstractmethod
    def __init__(self, language: str = None):
        """Initialize the tokenizer with optional language-specific parameters.

        :param language: language for sentence tokenization; stored
            lower-cased, or as ``None`` when not given
        :type language: str
        """
        if language:
            self.language = language.lower()
        elif not hasattr(self, "language"):
            # Always define the attribute so subclasses can test
            # ``self.language`` without an AttributeError; a value already
            # set by a subclass (or at class level) is left untouched.
            self.language = None
        self.model = None
        self.lang_vars = None

    def tokenize(self, text: str, model=None) -> list[str]:
        """Tokenize ``text`` into sentences with the tokenizer's model.

        Can be overridden by language-specific tokenizers.

        :param text: text to be tokenized into sentences
        :type text: str
        :param model: tokenizer object to use; only adopted when the
            instance does not already hold a (truthy) ``model``
        :type model: object
        :return: the result of ``model.tokenize(text)``
        :rtype: list
        :raises CLTKException: if the effective model has no ``tokenize``
            method (including when no model is available at all)
        """
        # ``getattr`` with a default also covers instances whose
        # ``__init__`` never ran and so never defined the attribute.
        if not getattr(self, "model", None):
            self.model = model
        tokenizer = self.model
        if not hasattr(tokenizer, "tokenize"):
            raise CLTKException("model does not have 'tokenize' method.")
        lang_vars = getattr(self, "lang_vars", None)
        if lang_vars:
            # Install the language variables on the model via its private
            # ``_lang_vars`` attribute before tokenizing.
            tokenizer._lang_vars = lang_vars
        return tokenizer.tokenize(text)

    def _get_models_path(self, language):  # pragma: no cover
        """Return the sentence-tokenizer model directory for ``language``.

        The path is
        ``CLTK_DATA_DIR/<language>/model/<language>_models_cltk/tokenizers/sentence``.
        """
        return os.path.join(
            CLTK_DATA_DIR,
            language,
            "model",
            f"{language}_models_cltk",
            "tokenizers",
            "sentence",
        )
class PunktSentenceTokenizer(SentenceTokenizer):
    """Base class for punkt sentence tokenization."""

    missing_models_message = "PunktSentenceTokenizer requires a language model."

    def __init__(self, language: str = None, lang_vars=None):
        """Load the pickled punkt model for ``language``, if one is given.

        :param language: language for sentence tokenization
        :type language: str
        :param lang_vars: language variables object handed to the model at
            tokenization time
        :raises FileNotFoundError: if the model pickle for ``language``
            cannot be found
        """
        super().__init__(language=language)
        # The base initializer may leave ``language`` undefined when no
        # language was passed; normalize that to ``None``.
        self.language = getattr(self, "language", None)
        if self.language == "lat":
            self.language_old = "lat"
        elif not hasattr(self, "language_old"):
            # ``language_old`` is the prefix of the pickle file name. It
            # used to be defined only for "lat", so every other language
            # crashed with AttributeError below. Fall back to the language
            # code itself.
            # NOTE(review): confirm the model files are named
            # "<language>_punkt.pickle" for non-Latin languages.
            self.language_old = self.language
        self.lang_vars = lang_vars
        if self.language:
            self.models_path = self._get_models_path(self.language)
            model_file = os.path.join(
                os.path.expanduser(self.models_path),
                f"{self.language_old}_punkt.pickle",
            )
            try:
                # NOTE(review): presumably this unpickles the file; only
                # trusted model files should live in the data directory.
                self.model = open_pickle(model_file)
            except FileNotFoundError as err:
                # Same exception type as before, with a clearer message and
                # the original error kept as the explicit cause.
                raise type(err)(
                    PunktSentenceTokenizer.missing_models_message
                ) from err
class RegexSentenceTokenizer(SentenceTokenizer):
    """Base class for regex sentence tokenization."""

    def __init__(self, language: str = None, sent_end_chars: list[str] = None):
        """Build the sentence-splitting pattern from ``sent_end_chars``.

        :param language: language for sentence tokenization
        :type language: str
        :param sent_end_chars: list of sentence-ending punctuation marks;
            each entry is inserted verbatim into a regex character class
        :type sent_end_chars: list
        :raises ValueError: if ``sent_end_chars`` is missing or empty
        """
        super().__init__(language)
        if not sent_end_chars:
            # ValueError is still caught by ``except Exception``, so
            # existing callers are unaffected.
            raise ValueError("Must specify sent_end_chars")
        self.sent_end_chars = sent_end_chars
        self.sent_end_chars_regex = "|".join(self.sent_end_chars)
        # Split on one whitespace character that directly follows a
        # sentence-ending character (look-behind keeps the punctuation).
        # NOTE(review): the "|" separators sit inside a character class, so
        # a literal pipe is also treated as a sentence ender. Kept as-is
        # because callers may pass pre-escaped regex fragments and rely on
        # the current pattern.
        self.pattern = rf"(?<=[{self.sent_end_chars_regex}])\s"

    def tokenize(self, text: str, model=None) -> list[str]:
        """Tokenize ``text`` into sentences with the regular expression.

        :param text: text to be tokenized into sentences
        :type text: str
        :param model: unused; accepted for signature compatibility with
            the base class
        :return: the pieces of ``text`` produced by splitting on
            ``self.pattern``
        :rtype: list
        """
        return re.split(self.pattern, text)