Source code for cltk.alphabet.text_normalization
"""Functions for preprocessing texts. Not language-specific."""
from typing import Optional
from unicodedata import normalize
def cltk_normalize(text: str, compatibility: bool = True) -> str:
    """Apply Unicode normalization to ``text``: NFKC (compatibility
    composition) when ``compatibility`` is ``True`` (the default),
    otherwise NFC.
    """
    if compatibility:
        return normalize("NFKC", text)
    return normalize("NFC", text)
def remove_non_ascii(input_string: str) -> str:
    """Remove non-ASCII characters.
    Source: http://stackoverflow.com/a/1342373
    """
    no_ascii = "".join(char for char in input_string if ord(char) < 128)
    return no_ascii
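# A hedged usage sketch (not part of the original module): any character
# with a code point at or above 128 is dropped, including Latin letters
# carrying macrons or other diacritics:
#
#     >>> remove_non_ascii("Arma virumque canō")
#     'Arma virumque can'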
def remove_non_latin(input_string: str, also_keep: Optional[list[str]] = None) -> str:
    """Remove non-Latin characters.
    ``also_keep`` should be a list of extra characters (e.g., punctuation)
    that will not be filtered out. Spaces are always kept.
    """
    # Copy before appending so the caller's list is not mutated
    if also_keep:
        also_keep = list(also_keep) + [" "]
    else:
        also_keep = [" "]
    # Unaccented ASCII letters only; accented Latin characters are removed too
    latin_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    latin_chars += latin_chars.lower()
    latin_chars += "".join(also_keep)
    latin_only = "".join(char for char in input_string if char in latin_chars)
    return latin_only
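# A hedged usage sketch (not part of the original module): unaccented Latin
# letters and spaces survive while Greek letters are dropped (note the
# doubled space left behind), and ``also_keep`` preserves extra characters:
#
#     >>> remove_non_latin("Graecia οἶδα capta")
#     'Graecia  capta'
#     >>> remove_non_latin("Graecia capta!", also_keep=["!"])
#     'Graecia capta!'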
def split_trailing_punct(text: str, punctuation: Optional[list[str]] = None) -> str:
    """Some tokenizers, including the one in Stanza, do not always
    handle punctuation properly. For example, a trailing colon
    (``"οἶδα:"``) is not split off into its own punctuation token.
    This function does such splitting on raw text before it is sent
    to such a tokenizer.

    Args:
        text: Input text string.
        punctuation: List of punctuation characters to split off when trailing a word.

    Returns:
        Text string with trailing punctuation separated by a whitespace character.

    >>> raw_text = "κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
    >>> split_trailing_punct(text=raw_text)
    'κατηγόρων ’, οὐκ οἶδα : ἐγὼ δ᾽ οὖν'
    """
    if not punctuation:
        # TODO: Decide how to handle the elision mark (``᾽``) in e.g. ``δ᾽``
        punctuation = [":", "’", "”"]  # colon and closing curly quotes
    new_chars: list[str] = list()
    for index, char in enumerate(text):
        if char in punctuation and index > 0:
            # The punctuation may be attached to the end of a word
            prev_char = text[index - 1]
            if prev_char.isspace():
                # Whitespace already precedes the punctuation; add nothing
                new_chars.append(char)
            else:
                # No whitespace before the punctuation, so split here
                new_chars.append(f" {char}")
        else:
            new_chars.append(char)
    return "".join(new_chars)
def split_leading_punct(text: str, punctuation: Optional[list[str]] = None) -> str:
    """Some tokenizers, including the one in Stanza, do not always
    handle punctuation properly. For example, an opening curly quote
    (``"‘κατηγόρων’"``) is not split off into its own punctuation
    token. This function does such splitting on raw text before it
    is sent to such a tokenizer.

    Args:
        text: Input text string.
        punctuation: List of punctuation characters to split off when leading a word.

    Returns:
        Text string with leading punctuation separated by a whitespace character.

    >>> raw_text = "‘κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
    >>> split_leading_punct(text=raw_text)
    '‘ κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν'
    """
    if not punctuation:
        punctuation = ["‘", "“"]  # opening curly quotes
    new_chars: list[str] = list()
    last_char_idx = len(text) - 1
    for index, char in enumerate(text):
        if index == last_char_idx:
            # At the end of the string, there is nothing to split from
            new_chars.append(char)
            continue
        next_char = text[index + 1]
        if next_char.isspace():
            # Whitespace already follows this character; add nothing
            new_chars.append(char)
        elif char in punctuation:
            # Opening punctuation attached to a word start, so split here
            new_chars.append(f"{char} ")
        else:
            new_chars.append(char)
    return "".join(new_chars)
def remove_odd_punct(text: str, punctuation: Optional[list[str]] = None) -> str:
    """Remove certain characters that downstream processes do not
    handle well. It would be better to use ``split_leading_punct()``
    and ``split_trailing_punct()``, however the default models that
    ship with Stanza make very strange mistakes when, e.g., ``"‘"``
    is made into its own token.

    TODO: Decide what to do about the apostrophe following an elision
    (e.g., ``"δ᾽"``).

    >>> raw_text = "‘κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
    >>> remove_odd_punct(raw_text)
    'κατηγόρων, οὐκ οἶδα ἐγὼ δ᾽ οὖν'
    """
    if not punctuation:
        punctuation = ["‘", "“", ":", "’", "”"]
    chars: list[str] = [char for char in text if char not in punctuation]
    return "".join(chars)