Source code for cltk.sentence.san
"""Sentence tokenization for Sanskrit.
>>> from cltk.sentence.san import SanskritRegexSentenceTokenizer
>>> from cltk.languages.example_texts import get_example_text
>>> splitter = SanskritRegexSentenceTokenizer()
>>> sentences = splitter.tokenize(get_example_text("san"))
>>> sentences[1]
'तेन त्यक्तेन भुञ्जीथा मा गृधः कस्य स्विद्धनम् ॥'
>>> len(sentences)
12
"""
__author__ = ["Patrick J. Burns <patrick@diyclassics.org>"]
__license__ = "MIT License."
import string
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.sentence.sentence import RegexSentenceTokenizer
[docs]class SanskritLanguageVars(PunktLanguageVars):
sent_end_chars = ["\u0964", "\u0965", "\|", "\|\|"]
[docs]class SanskritRegexSentenceTokenizer(RegexSentenceTokenizer):
"""RegexSentenceTokenizer for Sanskrit."""
def __init__(self):
super().__init__(
language="sanskrit", sent_end_chars=SanskritLanguageVars.sent_end_chars
)