# Source code for cltk.ner.processes
"""This module holds the ``Process``es for NER."""
from copy import copy
from dataclasses import dataclass
from typing import Any
from boltons.cacheutils import cachedproperty
from cltk.core.data_types import Doc, Process
from cltk.ner.ner import tag_ner
@dataclass
class NERProcess(Process):
    """Base ``Process`` for named entity recognition (NER).

    Each language's NER process inherits from this class and sets
    its own ``language`` value.

    >>> from cltk.core.data_types import Doc
    >>> from cltk.ner.processes import NERProcess
    >>> from cltk.core.data_types import Process
    >>> issubclass(NERProcess, Process)
    True
    >>> ner_proc = NERProcess()

    """

    # Handed to the tagger as ``iso_code`` in ``run()``.
    language: str = None

    @cachedproperty
    def algorithm(self):
        """Return the callable used to tag entities (``tag_ner``)."""
        return tag_ner

    def run(self, input_doc: Doc) -> Doc:
        """Tag every word of ``input_doc`` with a named-entity value.

        The tagger is called once with ``self.language`` and
        ``input_doc.tokens``; the value it returns at position ``i`` is
        stored on ``words[i].named_entity`` of the returned document.

        Args:
            input_doc: Document whose ``tokens`` are sent to the tagger.

        Returns:
            A copy of ``input_doc`` (made with ``copy.copy``) whose words
            carry the ``named_entity`` values.

        Raises:
            IndexError: If the tagger returns fewer values than the
                document has words.
        """
        # NOTE(review): ``copy`` is shallow, so the word objects are
        # presumably shared with ``input_doc`` -- confirm against ``Doc``.
        tagged_doc = copy(input_doc)
        entity_labels: list[Any] = self.algorithm(
            iso_code=self.language, input_tokens=input_doc.tokens
        )
        for position, word in enumerate(tagged_doc.words):
            word.named_entity = entity_labels[position]
            tagged_doc.words[position] = word
        return tagged_doc
@dataclass
class GreekNERProcess(NERProcess):
    """Default NER ``Process`` for Greek (language code ``grc``).

    .. todo::
       Update doctest w/ production model

    >>> from cltk.core.data_types import Doc, Word
    >>> from cltk.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> text = "ἐπὶ δ᾽ οὖν τοῖς πρώτοις τοῖσδε Περικλῆς ὁ Ξανθίππου ᾑρέθη λέγειν. καὶ ἐπειδὴ καιρὸς ἐλάμβανε, προελθὼν ἀπὸ τοῦ σήματος ἐπὶ βῆμα ὑψηλὸν πεποιημένον, ὅπως ἀκούοιτο ὡς ἐπὶ πλεῖστον τοῦ ὁμίλου, ἔλεγε τοιάδε."
    >>> tokens = [Word(string=token) for token in split_punct_ws(text)]
    >>> a_process = GreekNERProcess()
    >>> output_doc = a_process.run(Doc(raw=text, words=tokens))
    >>> output_doc.words[7].string
    'ὁ'
    >>> output_doc.words[7].named_entity
    False
    >>> output_doc.words[8].string
    'Ξανθίππου'
    >>> output_doc.words[8].named_entity
    False

    """

    # Language code for this process.
    language: str = "grc"
    # Human-readable summary of this process.
    description: str = "Default NER for Greek."
@dataclass
class OldEnglishNERProcess(NERProcess):
    """Default NER ``Process`` for Old English (language code ``ang``).

    .. todo::
       Update doctest w/ production model

    >>> from cltk.core.data_types import Doc, Word
    >>> from cltk.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> text = get_example_text(iso_code="ang")
    >>> tokens = [Word(string=token) for token in split_punct_ws(text)]
    >>> a_process = OldEnglishNERProcess()
    >>> output_doc = a_process.run(Doc(raw=text, words=tokens))
    >>> output_doc.words[2].string, output_doc.words[2].named_entity
    ('Gardena', 'LOCATION')

    """

    # Language code for this process.
    language: str = "ang"
    # Human-readable summary of this process.
    description: str = "Default NER for Old English."
@dataclass
class LatinNERProcess(NERProcess):
    """Default NER ``Process`` for Latin (language code ``lat``).

    >>> from cltk.core.data_types import Doc, Word
    >>> from cltk.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> tokens = [Word(string=token) for token in split_punct_ws(get_example_text("lat"))]
    >>> a_process = LatinNERProcess()
    >>> output_doc = a_process.run(Doc(raw=get_example_text("lat"), words=tokens))
    >>> [word.named_entity for word in output_doc.words][:20]
    ['LOCATION', False, False, False, False, False, False, False, False, False, 'LOCATION', False, 'LOCATION', False, False, False, False, 'LOCATION', False, 'LOCATION']

    """

    # Language code for this process.
    language: str = "lat"
    # Human-readable summary of this process.
    description: str = "Default NER for Latin."
@dataclass
class OldFrenchNERProcess(NERProcess):
    """Default NER ``Process`` for Old French (language code ``fro``).

    >>> from cltk.core.data_types import Doc, Word
    >>> from cltk.languages.example_texts import get_example_text
    >>> from boltons.strutils import split_punct_ws
    >>> tokens = [Word(string=token) for token in split_punct_ws(get_example_text("fro"))]
    >>> a_process = OldFrenchNERProcess()
    >>> output_doc = a_process.run(Doc(raw=get_example_text("fro"), words=tokens))
    >>> output_doc.words[30].string
    'Bretaigne'
    >>> output_doc.words[30].named_entity
    'LOC'
    >>> output_doc.words[31].named_entity
    False

    """

    # Language code for this process.
    language: str = "fro"
    # Human-readable summary of this process.
    description: str = "Default NER for Old French."