Source code for cltk.alphabet.text_normalization
"""Functions for preprocessing texts. Not language-specific."""
from typing import Optional
from unicodedata import normalize
def cltk_normalize(text: str, compatibility: bool = True) -> str:
    """Apply Unicode normalization to ``text``: NFKC (compatibility
    composition) when ``compatibility`` is ``True`` (the default),
    otherwise NFC.
    """
    if compatibility:
        return normalize("NFKC", text)
    return normalize("NFC", text)
def remove_non_ascii(input_string: str) -> str:
    """Remove non-ASCII characters.
    Source: http://stackoverflow.com/a/1342373
    """
    no_ascii = "".join(char for char in input_string if ord(char) < 128)
    return no_ascii
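# A hedged usage sketch (not part of the original module): any character
# with a code point at or above 128 is dropped, including Latin letters
# carrying macrons or other diacritics:
#
#     >>> remove_non_ascii("Arma virumque canō")
#     'Arma virumque can'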
def remove_non_latin(input_string: str, also_keep: Optional[list[str]] = None) -> str:
    """Remove non-Latin characters.
    ``also_keep`` should be a list of extra characters (e.g., punctuation)
    that will not be filtered out. Spaces are always kept.
    """
    # Copy before appending so the caller's list is not mutated
    if also_keep:
        also_keep = list(also_keep) + [" "]
    else:
        also_keep = [" "]
    # Unaccented ASCII letters only; accented Latin characters are removed too
    latin_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    latin_chars += latin_chars.lower()
    latin_chars += "".join(also_keep)
    latin_only = "".join(char for char in input_string if char in latin_chars)
    return latin_only
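# A hedged usage sketch (not part of the original module): unaccented Latin
# letters and spaces survive while Greek letters are dropped (note the
# doubled space left behind), and ``also_keep`` preserves extra characters:
#
#     >>> remove_non_latin("Graecia οἶδα capta")
#     'Graecia  capta'
#     >>> remove_non_latin("Graecia capta!", also_keep=["!"])
#     'Graecia capta!'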
def split_trailing_punct(text: str, punctuation: Optional[list[str]] = None) -> str:
    """Some tokenizers, including the one in Stanza, do not always
    handle punctuation properly. For example, a trailing colon
    (``"οἶδα:"``) is not split off into its own punctuation token.
    This function does such splitting on raw text before it is sent
    to such a tokenizer.

    Args:
        text: Input text string.
        punctuation: List of punctuation characters to split off when trailing a word.

    Returns:
        Text string with trailing punctuation separated by a whitespace character.

    >>> raw_text = "κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
    >>> split_trailing_punct(text=raw_text)
    'κατηγόρων ’, οὐκ οἶδα : ἐγὼ δ᾽ οὖν'
    """
    if not punctuation:
        # TODO: Decide how to handle the elision mark (``᾽``) in e.g. ``δ᾽``
        punctuation = [":", "’", "”"]  # colon and closing curly quotes
    new_chars: list[str] = list()
    for index, char in enumerate(text):
        if char in punctuation and index > 0:
            # The punctuation may be attached to the end of a word
            prev_char = text[index - 1]
            if prev_char.isspace():
                # Whitespace already precedes the punctuation; add nothing
                new_chars.append(char)
            else:
                # No whitespace before the punctuation, so split here
                new_chars.append(f" {char}")
        else:
            new_chars.append(char)
    return "".join(new_chars)
def split_leading_punct(text: str, punctuation: Optional[list[str]] = None) -> str:
    """Some tokenizers, including the one in Stanza, do not always
    handle punctuation properly. For example, an opening curly quote
    (``"‘κατηγόρων’"``) is not split off into its own punctuation
    token. This function does such splitting on raw text before it
    is sent to such a tokenizer.

    Args:
        text: Input text string.
        punctuation: List of punctuation characters to split off when leading a word.

    Returns:
        Text string with leading punctuation separated by a whitespace character.

    >>> raw_text = "‘κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
    >>> split_leading_punct(text=raw_text)
    '‘ κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν'
    """
    if not punctuation:
        punctuation = ["‘", "“"]  # opening curly quotes
    new_chars: list[str] = list()
    last_char_idx = len(text) - 1
    for index, char in enumerate(text):
        if index == last_char_idx:
            # At the end of the string, there is nothing to split from
            new_chars.append(char)
            continue
        next_char = text[index + 1]
        if next_char.isspace():
            # Whitespace already follows this character; add nothing
            new_chars.append(char)
        elif char in punctuation:
            # Opening punctuation attached to a word start, so split here
            new_chars.append(f"{char} ")
        else:
            new_chars.append(char)
    return "".join(new_chars)
def remove_odd_punct(text: str, punctuation: Optional[list[str]] = None) -> str:
    """Remove certain characters that downstream processes do not
    handle well. It would be better to use ``split_leading_punct()``
    and ``split_trailing_punct()``, however the default models that
    ship with Stanza make very strange mistakes when, e.g., ``"‘"``
    is made into its own token.

    TODO: Decide what to do about the apostrophe following an elision
    (e.g., ``"δ᾽"``).

    >>> raw_text = "‘κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
    >>> remove_odd_punct(raw_text)
    'κατηγόρων, οὐκ οἶδα ἐγὼ δ᾽ οὖν'
    """
    if not punctuation:
        punctuation = ["‘", "“", ":", "’", "”"]
    chars: list[str] = [char for char in text if char not in punctuation]
    return "".join(chars)