"""A CLTK interface for Sanskrit, Greek and Latin WordNets, built on the NLTK WordNet API
The Sanskrit, Greek and Latin WordNets are lexico-semantic databases for the classical languages inspired by the Princeton WordNet for English. Most directly, these WordNets build on the framework of the Fondazione Bruno Kessler's MultiWordNet Project.
The CLTK WordNet API provides a nearly complete interface to the RESTful API provided by these services and thus provides access to the rich lexical and, especially, semantic information they contain. The WordNets share a common set of semantic descriptors
(synsets) for defining the senses of words, as well as language-specific ones.
The WordNetCorpusReader class is the main entry point for getting information about lemmas, synsets,
and various lexical and semantic (conceptual) relationships such as hypernymy, hyponymy,
synonymy, antonymy etc. It is also possible to compute semantic similarities using several different algorithms.
>>> from cltk.wordnet.wordnet import WordNetCorpusReader
>>> LWN = WordNetCorpusReader(iso_code="lat")
>>> uirtus = LWN.lemma('uirtus')
>>> list(uirtus[0].synsets())
[Synset(pos='n', offset='05595229', gloss='feeling no fear'), Synset(pos='n', offset='04504076', gloss='a characteristic property that defines the apparent individual nature of something'), Synset(pos='n', offset='04349777', gloss='possession of the qualities (especially mental qualities) required to do something or get something done; "danger heightened his powers of discrimination"'), Synset(pos='n', offset='04549901', gloss='an ideal of personal excellence toward which a person strives'), Synset(pos='n', offset='03800378', gloss='moral excellence or admirableness'), Synset(pos='n', offset='03800842', gloss='morality with respect to sexual relations'), Synset(pos='n', offset='03805961', gloss='a quality of spirit that enables you to face danger of pain without showing fear'), Synset(pos='n', offset='03929156', gloss='strength of mind that enables one to endure adversity with courage'), Synset(pos='n', offset='03678310', gloss='the trait of being manly; having the characteristics of an adult male'), Synset(pos='n', offset='03806773', gloss='resolute courageousness'), Synset(pos='n', offset='04505328', gloss='something in which something or some one excels'), Synset(pos='n', offset='03806965', gloss='the trait of having a courageous spirit'), Synset(pos='n', offset='03655289', gloss='courageous high-spiritedness'), Synset(pos='n', offset='03808136', gloss='the trait of showing courage and determination in spite of possible loss or injury'), Synset(pos='n', offset='04003047', gloss='the quality that renders something desirable or valuable or useful'), Synset(pos='n', offset='03717355', gloss='a degree or grade of excellence or worth'), Synset(pos='n', offset='04003707', gloss='any admirable quality or attribute'), Synset(pos='n', offset='03798920', gloss='the quality of doing what is right and avoiding what is wrong'), Synset(pos='n', offset='03799068', gloss='a particular moral excellence')]
>>> LWN.synset('n#03457380')
Synset(pos='n', offset='03457380', gloss='a cutting or thrusting weapon with a long blade')
>>> from cltk.wordnet.wordnet import Synset
>>> s1 = Synset(LWN, 'lat', pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
>>> s2 = Synset(LWN, 'lat', pos='n', offset='03457380', gloss='a cutting or thrusting weapon with a long blade')
>>> s1.lowest_common_hypernyms(s2)
[Synset(pos='n', offset='03601056', gloss='weaponry used in fighting or hunting')]
>>> s1.shortest_path_distance(s2)
3
>>> s1.wup_similarity(s2)
0.8
"""
from __future__ import print_function, unicode_literals
import codecs
import logging
import math
import os
import re
import string
from collections import defaultdict, deque
from functools import total_ordering
from itertools import chain
from operator import itemgetter
import requests
from nltk.corpus.reader import CorpusReader
from nltk.probability import FreqDist
from cltk.utils import get_cltk_data_dir
logger = logging.getLogger(__name__)
def nesteddict():
    """Return a ``defaultdict`` whose missing keys are themselves nested dicts.

    This is a named function rather than a lambda so that the factory has a
    meaningful ``__name__`` and dictionaries built from it can be pickled.
    """
    return defaultdict(nesteddict)


# Translation table for ``str.translate`` that deletes every character
# listed in ``string.punctuation``.
punctuation = str.maketrans("", "", string.punctuation)
######################################################################
# Table of Contents
######################################################################
# - Constants
# - Data Classes
# - WordNetError
# - Lemma
# - Synset
# - WordNet Corpus Reader
# - WordNet Information Content Corpus Reader
# - Similarity Metrics
# - Demo
######################################################################
# Constants
######################################################################
#: Stand-in for positive infinity, used by the similarity functions.
_INF = 1e300

# { Part-of-speech constants
ADJ = "a"
ADV = "r"
NOUN = "n"
VERB = "v"
PREP = "p"
# }

# Every part-of-speech tag, nouns first.
POS_LIST = [NOUN, VERB, ADJ, ADV, PREP]

# Matches an identifier of the form ``<pos>#<word characters>``, e.g.
# ``n#03457380``; group 1 is the POS tag and group 2 the remainder.
SENSENUM_RE = re.compile(r"^([nvarp])#(\w+)$")
######################################################################
# Data Classes
######################################################################
class WordNetError(Exception):
    """An exception class for WordNet-related errors.

    Raised, for example, when a lookup matches more than one entry and the
    result cannot be disambiguated.
    """
class _WordNetObject(object):
    """A common base class for lemmas and synsets.

    Every method here delegates to ``self.related(symbol)`` with a fixed
    pointer symbol; ``related`` itself is not defined on this class and must
    be supplied by the concrete subclass.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> sub = Lemma(LWN, lemma='sub', pos='r', morpho='rp--------', uri='37096')
    >>> 'super' in [lemma.lemma() for lemma in sub.antonyms()]
    True
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s1.hypernyms()
    [Synset(pos='n', offset='02893681', gloss='a weapon with a handle and blade with a sharp point')]
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s1.hyponyms()
    [Synset(pos='n', offset='02575932', gloss='(Scottish) a long straight-bladed dagger'), Synset(pos='n', offset='03155758', gloss='a dagger with a slender blade'), Synset(pos='n', offset='03413564', gloss='a small dagger with a tapered blade')]
    >>> s1 = LWN.synset_from_pos_and_offset('n', '00510771')
    >>> s1.member_meronyms()
    [Synset(pos='n', offset='07260585', gloss='a supporter of feminism')]
    >>> s1 = LWN.synset_from_pos_and_offset('n', '02335723')
    >>> s1.substance_meronyms()
    [Synset(pos='n', offset='10626993', gloss='soil that is plastic when moist but hard when fired')]
    >>> s1 = LWN.synset_from_pos_and_offset('n', '00541686')
    >>> s1.attributes()
    [Synset(pos='a', offset='01151057', gloss='sexually attracted to members of the opposite sex'), Synset(pos='a', offset='01151299', gloss='sexually attracted to members of your own sex')]
    >>> s1 = LWN.synset_from_pos_and_offset('n', '00077986')
    >>> s1.part_meronyms()
    [Synset(pos='n', offset='00078772', gloss='preparation for the delivery of shellfire on a target')]
    >>> s1 = LWN.synset_from_pos_and_offset('v', '00107243')
    >>> s1.also_sees()
    [Synset(pos='v', offset='00293275', gloss='become looser or slack')]
    >>> s1 = LWN.synset_from_pos_and_offset('v', '00001740')
    >>> s1.entailments()
    [Synset(pos='v', offset='00003142', gloss='expel air'), Synset(pos='v', offset='00003763', gloss='draw in air')]
    >>> s1 = LWN.synset_from_pos_and_offset('v', '00014590')
    >>> s1.causes()
    [Synset(pos='v', offset='00009805', gloss='be asleep')]
    >>> s1 = LWN.synset_from_pos_and_offset('v', '00051515')
    >>> s1.verb_groups()
    [Synset(pos='v', offset='00050470', gloss='eliminate urine')]
    >>> s1 = LWN.synset_from_pos_and_offset('n', 'L9083855')
    >>> s1.nearest()
    [Synset(pos='n', offset='03543592', gloss='ship for transporting troops')]
    """

    def antonyms(self):
        """Return the objects related by the ``!`` (antonym) pointer."""
        return self.related("!")

    def hypernyms(self):
        """Return the objects related by the ``@`` (hypernym) pointer."""
        return self.related("@")

    def _hypernyms(self):
        # Private twin of ``hypernyms``; kept as a separate method so that a
        # subclass overriding the public one does not affect this lookup.
        return self.related("@")

    def hyponyms(self):
        """Return the objects related by the ``~`` (hyponym) pointer."""
        return self.related("~")

    def member_holonyms(self):  # pragma: no cover
        """Return the objects related by the ``#m`` (member holonym) pointer."""
        return self.related("#m")

    def substance_holonyms(self):  # pragma: no cover
        """Return the objects related by the ``#s`` (substance holonym) pointer."""
        return self.related("#s")

    def part_holonyms(self):  # pragma: no cover
        """Return the objects related by the ``#p`` (part holonym) pointer."""
        return self.related("#p")

    def member_meronyms(self):
        """Return the objects related by the ``%m`` (member meronym) pointer."""
        return self.related("%m")

    def substance_meronyms(self):
        """Return the objects related by the ``%s`` (substance meronym) pointer."""
        return self.related("%s")

    def part_meronyms(self):
        """Return the objects related by the ``%p`` (part meronym) pointer."""
        return self.related("%p")

    def attributes(self):
        """Return the objects related by the ``=`` (attribute) pointer."""
        return self.related("=")

    def entailments(self):
        """Return the objects related by the ``*`` (entailment) pointer."""
        return self.related("*")

    def causes(self):
        """Return the objects related by the ``>`` (cause) pointer."""
        return self.related(">")

    def also_sees(self):
        """Return the objects related by the ``^`` (also-see) pointer."""
        return self.related("^")

    def verb_groups(self):
        """Return the objects related by the ``$`` (verb group) pointer."""
        return self.related("$")

    def similar_tos(self):
        """Return the objects related by the ``&`` (similar-to) pointer."""
        return self.related("&")

    def nearest(self):
        """Return the objects related by the ``|`` (nearest) pointer."""
        return self.related("|")
@total_ordering
class Lemma(_WordNetObject):
    """The lexical entry for a single morphological form of a
    sense-disambiguated word.

    Create a Lemma from lemma, pos, and morpho, or uri parameters where:
    <lemma> is the morphological form identifying the lemma
    <pos> is one of the module attributes 'n', 'v', 'a' or 'r'
    <morpho> is the morphological descriptor
    <uri> is the URI

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> animus = Lemma(LWN, lemma='animus', pos='n', morpho='n-s---mn2-', uri='a2046')
    >>> print(animus)
    Lemma(lemma='animus', pos='n', morpho='n-s---mn2-', uri='a2046')
    >>> virtus = Lemma(LWN, lemma='uirtus', pos='n', morpho='n-s---fn3-', uri='u0800')
    >>> print(virtus)
    Lemma(lemma='uirtus', pos='n', morpho='n-s---fn3-', uri='u0800')

    Lemma attributes, accessible via methods with the same name:

    - lemma: The canonical form of this lemma
    - synsets: The synsets that this lemma belongs to
    - literal: The synsets that this lemma belongs to in virtue of its literal senses
    - metonymic: The synsets that this lemma belongs to in virtue of its metonymic senses
    - metaphoric: The synsets that this lemma belongs to in virtue of its metaphoric senses
    - count: The frequency of this lemma in the WordNet, i.e., the number of synsets
      (literal, metonymic, or metaphoric) to which it belongs

    Lemma methods:

    Lemmas have the following methods for retrieving related Lemmas. They
    correspond to the names for the pointer symbols defined here:
    https://wordnet.princeton.edu/documentation/wninput5wn
    These methods all return lists of Lemmas:

    - antonyms
    - hypernyms
    - hyponyms
    - member_holonyms, substance_holonyms, part_holonyms
    - member_meronyms, substance_meronyms, part_meronyms
    - attributes
    - derivationally_related_forms
    - entailments
    - causes
    - also_sees
    - verb_groups
    - similar_tos
    - pertainyms
    """

    __slots__ = [
        "_wordnet_corpus_reader",
        "_lemma",
        "_pos",
        "_morpho",
        "__synsets",
        "__related",
        "_literal",
        "_metonymic",
        "_metaphoric",
        "_uri",
        "_lang",
    ]

    def __init__(self, wordnet_corpus_reader, lemma, pos, morpho, uri, **kwargs):
        """Store the identifying fields; nothing is fetched until first use.

        Extra keyword arguments are accepted and ignored.
        """
        self._wordnet_corpus_reader = wordnet_corpus_reader
        self._lemma = lemma
        self._pos = pos
        self._morpho = morpho
        self._uri = uri
        # Lazily populated caches for the two remote lookups below.
        self.__synsets = None
        self.__related = None

    def uri(self):
        """URI.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> metus = Lemma(LWN, lemma='metus', pos='n', morpho='n-s---mn4-', uri='m0918')
        >>> metus.uri()
        'm0918'
        """
        return self._uri

    def lemma(self):
        """Lemma.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> metus = Lemma(LWN, lemma='metus', pos='n', morpho='n-s---mn4-', uri='m0918')
        >>> metus.lemma()
        'metus'
        """
        return self._lemma

    def pos(self):
        """POS.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> metus = Lemma(LWN, lemma='metus', pos='n', morpho='n-s---mn4-', uri='m0918')
        >>> metus.pos()
        'n'
        """
        return self._pos

    def morpho(self):
        """Morpho.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> metus = Lemma(LWN, lemma='metus', pos='n', morpho='n-s---mn4-', uri='m0918')
        >>> metus.morpho()
        'n-s---mn4-'
        """
        return self._morpho

    def _api_url(self, resource):
        """Build the endpoint URL for *resource* (``relations`` or ``synsets``).

        The lemma/pos/morpho route is used only when all three are set;
        otherwise the lookup falls back to the URI route.
        """
        host = self._wordnet_corpus_reader.host()
        if not (self.lemma() and self.pos() and self.morpho()):
            return f"{host}/api/uri/{self.uri()}/{resource}/?format=json"
        return (
            f"{host}/api/lemmas/{self.lemma()}/{self.pos()}"
            f"/{self.morpho()}/{resource}/?format=json"
        )

    @property
    def _related(self):
        """Relations payload for this lemma, fetched once and cached.

        :raises WordNetError: if several lemmas match and the corpus reader
            is not configured to ignore errors.
        :raises IndexError: if the service returns no match.
        :return: The ``relations`` entry of the single match, or ``None``
            when the match is ambiguous and errors are ignored.
        """
        if self.__related is None:
            results = requests.get(
                self._api_url("relations"),
                timeout=(30.0, 90.0),
            ).json()["results"]
            if len(results) > 1:
                if not self._wordnet_corpus_reader._ignore_errors:
                    ambiguous = [
                        f"{result['lemma']['lemma']} ({result['lemma']['morpho']})"
                        for result in results
                    ]
                    raise WordNetError(f"can't disambiguate {', '.join(ambiguous)}")
            else:
                self.__related = results[0]["relations"]
        return self.__related

    @property
    def _synsets(self):
        """Synsets payload for this lemma, fetched once and cached.

        :raises WordNetError: if several lemmas match and the corpus reader
            is not configured to ignore errors.
        :raises IndexError: if the service answers with an empty result list.
        :return: The ``synsets`` entry of the single match, or ``None`` when
            the request fails or the match is ambiguous and errors are ignored.
        """
        if self.__synsets is None:
            # Keep the Response object for both routes: it is tested for
            # success first and decoded exactly once afterwards.
            response = requests.get(
                self._api_url("synsets"),
                timeout=(30.0, 90.0),
            )
            if response:
                data = response.json()["results"]
                if len(data) > 1:
                    if not self._wordnet_corpus_reader._ignore_errors:
                        ambiguous = [
                            f"{result['lemma']} ({result['morpho']})" for result in data
                        ]
                        raise WordNetError(
                            f"can't disambiguate {', '.join(ambiguous)}"
                        )
                else:
                    self.__synsets = data[0]["synsets"]
        return self.__synsets

    def _senses(self, sense_type):
        """Return a generator of Synsets for one sense category.

        Yields nothing when no payload is available or the category is
        absent from it.
        """
        entries = (self._synsets or {}).get(sense_type, [])
        return (
            Synset(
                self._wordnet_corpus_reader,
                synset["language"],
                synset["pos"],
                synset["offset"],
                synset["gloss"],
            )
            for synset in entries
        )

    def synsets(self):
        """Retrieve all synsets for the lemma.

        :return: A generator of Synset objects.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> virtus = LWN.lemmas_from_uri('u0800')[0]
        >>> synset = list(virtus.synsets())[0]
        >>> print(synset.gloss())
        feeling no fear
        """
        return chain(self.literal(), self.metonymic(), self.metaphoric())

    def literal(self):
        """Retrieve all literal senses of the lemma.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> virtus = LWN.lemmas_from_uri('u0800')[0]
        >>> list(virtus.literal())
        [Synset(pos='n', offset='05595229', gloss='feeling no fear'), Synset(pos='n', offset='04504076', gloss='a characteristic property that defines the apparent individual nature of something'), Synset(pos='n', offset='04349777', gloss='possession of the qualities (especially mental qualities) required to do something or get something done; "danger heightened his powers of discrimination"'), Synset(pos='n', offset='04549901', gloss='an ideal of personal excellence toward which a person strives'), Synset(pos='n', offset='03800378', gloss='moral excellence or admirableness'), Synset(pos='n', offset='03800842', gloss='morality with respect to sexual relations'), Synset(pos='n', offset='03805961', gloss='a quality of spirit that enables you to face danger of pain without showing fear'), Synset(pos='n', offset='03929156', gloss='strength of mind that enables one to endure adversity with courage'), Synset(pos='n', offset='03678310', gloss='the trait of being manly; having the characteristics of an adult male'), Synset(pos='n', offset='03806773', gloss='resolute courageousness'), Synset(pos='n', offset='04505328', gloss='something in which something or some one excels'), Synset(pos='n', offset='03806965', gloss='the trait of having a courageous spirit'), Synset(pos='n', offset='03655289', gloss='courageous high-spiritedness'), Synset(pos='n', offset='03808136', gloss='the trait of showing courage and determination in spite of possible loss or injury'), Synset(pos='n', offset='04003047', gloss='the quality that renders something desirable or valuable or useful'), Synset(pos='n', offset='03717355', gloss='a degree or grade of excellence or worth'), Synset(pos='n', offset='04003707', gloss='any admirable quality or attribute'), Synset(pos='n', offset='03798920', gloss='the quality of doing what is right and avoiding what is wrong'), Synset(pos='n', offset='03799068', gloss='a particular moral excellence')]
        """
        return self._senses("literal")

    def metonymic(self):
        """Retrieve all metonymic senses of the lemma.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> baculum = LWN.lemma('baculum', 'n', 'n-s---nn2-')
        >>> list(baculum[0].metonymic())
        [Synset(pos='n', offset='02327416', gloss='a support that steadies or strengthens something else'), Synset(pos='n', offset='02531456', gloss='used as a weapon'), Synset(pos='n', offset='03444976', gloss='any device that bears the weight of another thing')]
        """
        return self._senses("metonymic")

    def metaphoric(self):
        """Retrieve all metaphoric senses of the lemma.

        ``synsets()`` chains this method after ``literal()`` and
        ``metonymic()``.
        """
        # NOTE(review): assumes the payload exposes a "metaphoric" key next
        # to "literal" and "metonymic" - confirm against the API.
        return self._senses("metaphoric")

    def pertainyms(self):
        """Pertainyms.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> abalienatio = LWN.lemma('abalienatio', 'n', 'n-s---fn3-')
        >>> abalienatio
        [Lemma(lemma='abalienatio', pos='n', morpho='n-s---fn3-', uri='a0014')]
        >>> list(abalienatio[0].pertainyms())
        [Lemma(lemma='abalieno', pos='v', morpho='v1spia--1-', uri='a0015'), Lemma(lemma='ab', pos='p', morpho='p---------', uri='a0001')]
        """
        return self.related("\\")

    def participle(self):
        """Return the lemmas related by the ``<`` (participle) pointer."""
        return self.related("<")

    def composed_of(self):
        """Composed of.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> evoco = LWN.lemma('euoco', 'v', 'v1spia--1-')
        >>> list(evoco[0].composed_of())
        [Lemma(lemma='uoco', pos='v', morpho='v1spia--1-', uri='u1152'), Lemma(lemma='ex', pos='p', morpho='p---------', uri='e1167')]
        """
        return self.related("+c")

    def composes(self):
        """Composes.

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> voco = LWN.lemma('uoco', 'v', 'v1spia--1-')
        >>> list(voco[0].composes())
        [Lemma(lemma='euoco', pos='v', morpho='v1spia--1-', uri='e1117'), Lemma(lemma='conuoco', pos='v', morpho='v1spia--1-', uri='c3931'), Lemma(lemma='prouoco', pos='v', morpho='v1spia--1-', uri='p4232'), Lemma(lemma='inuoco', pos='v', morpho='v1spia--1-', uri='i2733'), Lemma(lemma='reuoco', pos='v', morpho='v1spia--1-', uri='r1447')]
        """
        return self.related("-c")

    def __repr__(self):
        return (
            f"Lemma(lemma='{self.lemma()}', pos='{self.pos()}', "
            f"morpho='{self.morpho()}', uri='{self.uri()}')"
        )

    def __hash__(self):
        # Hashing on the lemma string alone stays consistent with __eq__:
        # equal lemmas necessarily share the same ``_lemma``.
        return hash(self._lemma)

    def __eq__(self, other):
        if not isinstance(other, Lemma):
            return NotImplemented
        return (
            self._lemma == other._lemma
            and self._pos == other._pos
            and self._morpho == other._morpho
            and self._uri == other._uri
        )

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        if not isinstance(other, Lemma):
            return NotImplemented
        return self._lemma < other._lemma
class Semfield:
    """
    Create a Semfield from code and english parameters where:
    <code> is the semfield's DDCS code
    <english> is the semfield's DDCS descriptor

    A semfield (semantic field) defines a broad conceptual domain that includes
    many synsets. The Latin WordNet uses the Dewey Decimal Classification System
    as a topic index and hierarchy.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> anatomy = Semfield(LWN, '611', "Human Anatomy, Cytology & Histology")
    """

    __slots__ = [
        "_wordnet_corpus_reader",
        "_code",
        "_english",
        "_synsets",
        "_lemmas",
        "_hypers",
        "_hypons",
    ]

    def __init__(self, wordnet_corpus_reader, code, english=None):
        """Store the identifying fields; related data is fetched lazily."""
        self._wordnet_corpus_reader = wordnet_corpus_reader
        self._code = code
        self._english = english
        # Caches filled on first access by the accessor of the same name.
        self._synsets = None
        self._lemmas = None
        self._hypers = None
        self._hypons = None

    def code(self):
        """Return the semfield's code."""
        return self._code

    def english(self):
        """Return the English descriptor, looking it up by code if unset.

        :raises WordNetError: if the code matches several semfields and the
            corpus reader is not configured to ignore errors.
        :return: The descriptor, or ``None`` if it could not be resolved.
        """
        if self._english is None:  # pragma: no cover
            response = requests.get(
                f"{self._wordnet_corpus_reader.host()}/api/semfields/{self.code()}/?format=json",
                timeout=(30.0, 90.0),
            )
            if response:
                self._english = self._select_english(response.json())
        return self._english

    def _select_english(self, candidates):
        """Pick the descriptor out of the decoded list of matching semfields.

        Returns ``None`` for an empty list, and for an ambiguous one when the
        corpus reader ignores errors.
        """
        if len(candidates) > 1:
            # Same convention as Lemma: ambiguity is an error unless the
            # reader was asked to ignore errors.
            if not self._wordnet_corpus_reader._ignore_errors:
                ambiguous = [f"'{semfield['english']}'" for semfield in candidates]
                raise WordNetError(f"can't disambiguate {', '.join(ambiguous)}")
            return None
        if not candidates:
            return None
        return candidates[0]["english"]

    def _first_result(self, endpoint=""):
        """Fetch ``/api/semfields/<code>/<english>/<endpoint>`` and return the
        first entry of its ``results`` list, or ``None`` if the request failed.
        """
        english = self.english().replace(" ", "_")
        response = requests.get(
            f"{self._wordnet_corpus_reader.host()}/api/semfields/{self.code()}/{english}/{endpoint}?format=json",
            timeout=(30.0, 90.0),
        )
        if not response:
            return None
        return response.json()["results"][0]

    def synsets(self):
        """Retrieve all synsets of the semfield.

        :return: A list of Synset objects (empty if the request failed).

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> anatomy = Semfield(LWN, '611', "Human anatomy, cytology & histology")
        >>> fat = LWN.synset('n#04089143')
        >>> print(fat in list(anatomy.synsets()))
        True
        """
        if self._synsets is None:
            result = self._first_result("synsets/")
            if result is None:
                self._synsets = []
            else:
                # Materialise the list: a cached generator would be empty on
                # every call after the first.
                self._synsets = [
                    Synset(
                        self._wordnet_corpus_reader,
                        synset["language"],
                        synset["pos"],
                        synset["offset"],
                        synset["gloss"],
                    )
                    for synset in result["synsets"]
                ]
        return self._synsets

    def lemmas(self):
        """Retrieve all lemmas for all synsets of the semfield.

        :return: A list of Lemma objects (empty if the request failed).

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> anatomy = Semfield(LWN, '611', "Human anatomy, cytology & histology")
        >>> list(anatomy.lemmas())[0]
        Lemma(lemma='autopsia', pos='n', morpho='n-s---fn1-', uri='50882')
        """
        if self._lemmas is None:
            result = self._first_result("lemmas/")
            if result is None:
                self._lemmas = []
            else:
                self._lemmas = [
                    Lemma(
                        self._wordnet_corpus_reader,
                        lemma["lemma"],
                        lemma["pos"],
                        lemma["morpho"],
                        lemma["uri"],
                    )
                    for lemma in result["lemmas"]
                ]
        return self._lemmas

    def hypers(self):
        """Retrieve all superordinate semfields of the semfield.

        :return: A list of Semfield objects (empty if the request failed).

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> anatomy = Semfield(LWN, '611', "Human anatomy, cytology & histology")
        >>> print(list(anatomy.hypers()))
        [Semfield(code='610', english='Medicine & Health')]
        """
        if self._hypers is None:
            result = self._first_result()
            if result is None:
                self._hypers = []
            else:
                # Materialised for the same reason as in ``synsets``.
                self._hypers = [
                    Semfield(
                        self._wordnet_corpus_reader,
                        semfield["code"],
                        semfield["english"],
                    )
                    for semfield in result["hypers"]
                ]
        return self._hypers

    def hypons(self):
        """Retrieve all subordinate semfields of the semfield.

        :return: A list of Semfield objects sorted by code (empty if the
            request failed).

        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> medicine = Semfield(LWN, '610', "Medicine & Health")
        >>> print(list(medicine.hypons()))
        [Semfield(code='610', english='Medicine & health'), Semfield(code='611', english='Human anatomy, cytology & histology'), Semfield(code='612', english='Human Physiology'), Semfield(code='613', english='Personal Health & Safety'), Semfield(code='614', english='Incidence & prevention of disease'), Semfield(code='615', english='Pharmacology & therapeutics'), Semfield(code='616', english='Diseases'), Semfield(code='617', english='Surgery & Related Medical Specialties'), Semfield(code='618', english='Gynecology, Obstetrics, Pediatrics & Geriatrics')]
        """
        if self._hypons is None:
            result = self._first_result()
            if result is None:
                self._hypons = []
            else:
                self._hypons = sorted(
                    [
                        Semfield(
                            self._wordnet_corpus_reader,
                            semfield["code"],
                            semfield["english"],
                        )
                        for semfield in result["hypons"]
                    ],
                    key=lambda x: x.code(),
                )
        return self._hypons

    def __repr__(self):
        return f"Semfield(code='{self.code()}', english='{self.english()}')"
[docs]@total_ordering
class Synset(_WordNetObject):
"""Create a Synset from pos, offset and gloss parameters where:
<pos> is the synset's part of speech
<offset> is the offset ID of the synset
<gloss> is the synset's gloss
>>> LWN = WordNetCorpusReader(iso_code="lat")
>>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
>>> print(s1.id())
n#02542418
Synset attributes, accessible via methods with the same name:
- pos: The synset's part of speech, 'n', 'v', 'a', or 'r'
- offset: The unique offset ID of the synset
- lemmas: A list of the Lemma objects for this synset
- gloss: The gloss for this synset
Synset methods:
Synsets have the following methods for retrieving related Synsets.
They correspond to the names for the pointer symbols defined here:
https://wordnet.princeton.edu/documentation/wninput5wn
These methods all return lists of Synsets.
- hypernyms
- hyponyms
- member_holonyms, substance_holonyms, part_holonyms
- member_meronyms, substance_meronyms, part_meronyms
- attributes
- entailments
- causes
- also_sees
- verb_groups
- similar_tos
- nearest
Additionally, Synsets support the following methods specific to the
hypernym relation:
- root_hypernyms
- common_hypernyms
- lowest_common_hypernyms
Note that Synsets do not support the following relations because
these are defined by WordNet as lexical relations:
- derivationally_related_forms
- pertainyms
- composed_of
- composes
- participle
"""
__slots__ = [
"_pos",
"_offset",
"_lemmas",
"_gloss",
"_semfields",
"_sentiment",
"__related",
"_max_depth",
"_min_depth",
"_all_hypernyms",
]
def __init__(
    self, wordnet_corpus_reader, language, pos, offset, gloss, semfield=None
):
    """Store the identifying fields; related data is fetched lazily.

    :param wordnet_corpus_reader: reader later queried for the API host.
    :param language: language code of the synset (may be ``None``).
    :param pos: part-of-speech tag.
    :param offset: offset ID of the synset.
    :param gloss: gloss text; only the part before the first ``:`` is kept.
    :param semfield: accepted but not used by this constructor.
    """
    self._wordnet_corpus_reader = wordnet_corpus_reader
    self._language = language
    self._pos = pos
    self._offset = offset
    # A missing gloss is stored as-is instead of failing on ``.split``.
    self._gloss = gloss.split(":")[0] if gloss else gloss
    # Caches filled on first access by the corresponding accessor.
    self._examples = None
    self._lemmas = None
    self.__related = None
    self._semfields = None
    self._sentiment = None
    self._all_hypernyms = None
def id(self):
    """Return the synset identifier in ``<pos>#<offset>`` form, e.g. ``n#02542418``."""
    return f"{self.pos()}#{self.offset()}"
def semfields(self):
    """Retrieve the synset's semfields.

    The raw semfield records are fetched once and cached; a fresh generator
    of Semfield objects is built on every call. A failed request is cached
    as an empty list.

    :return: A generator of Semfield objects.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('n', 'L6992236')
    >>> list(s1.semfields())
    [Semfield(code='150', english='Psychology')]
    """
    if self._semfields is None:
        response = requests.get(
            f"{self._wordnet_corpus_reader.host()}/api/synsets/{self.pos()}/{self.offset()}/?format=json",
            timeout=(30.0, 90.0),
        )
        if response:
            self._semfields = response.json()["results"][0]["semfield"]
        else:
            self._semfields = []
    return (
        Semfield(self._wordnet_corpus_reader, semfield["code"], semfield["english"])
        for semfield in self._semfields
    )
def sentiment(self):
    """Retrieve sentiment scores for the synset.

    The scores are fetched once and cached. If the request fails nothing is
    cached and ``None`` is returned.

    :return: A dict including the synset's positivity, negativity, and objectivity scores (-1 to 1).

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('v', '01215448')
    >>> s1.sentiment()
    {'positivity': 0.0, 'negativity': 0.625, 'objectivity': 0.375}
    """
    if self._sentiment is None:
        response = requests.get(
            f"{self._wordnet_corpus_reader.host()}/api/synsets/{self.pos()}/{self.offset()}/sentiment/?format=json",
            timeout=(30.0, 90.0),
        )
        if response:
            self._sentiment = response.json()["results"][0]["sentiment"]
    return self._sentiment
def positivity(self):
    """Positivity.

    :return: A float value representing the synset's positivity score.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('v', '01215448')
    >>> s1.positivity()
    0.0
    """
    if self._sentiment is None:
        # Populate the cached scores on first use.
        self.sentiment()
    return self._sentiment["positivity"]
def negativity(self):
    """Negativity.

    :return: A float value representing the synset's negativity score.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('v', '01215448')
    >>> s1.negativity()
    0.625
    """
    if self._sentiment is None:
        # Populate the cached scores on first use.
        self.sentiment()
    return self._sentiment["negativity"]
def objectivity(self):
    """Objectivity.

    :return: A float value representing the synset's objectivity score.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('v', '01215448')
    >>> s1.objectivity()
    0.375
    """
    if self._sentiment is None:
        # Populate the cached scores on first use.
        self.sentiment()
    return self._sentiment["objectivity"]
def language(self):
    """Return the language value this synset was created with."""
    return self._language
def pos(self):
    """Return the synset's part-of-speech tag."""
    return self._pos
def offset(self):
    """Return the synset's offset ID."""
    return self._offset
def gloss(self):
    """Return the synset's gloss as stored by the constructor."""
    return self._gloss
def examples(self):
    """Retrieve examples of any lemma instantiating this synset.

    The examples are fetched once and cached. If the request fails nothing
    is cached and ``None`` is returned.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('n', '04399253')
    >>> print(s1.examples()[0])
    {'lemma': {'lemma': 'baculum', 'pos': 'n', 'morpho': 'n-s---nn2-', 'uri': 'b0034', 'prosody': 'baculum'}, 'author_abbr': 'Vulg', 'work_abbr': 'Tob', 'reference': '10.4', 'text': 'baculum senectutis nostrae'}
    """
    if self._examples is None:
        response = requests.get(
            f"{self._wordnet_corpus_reader.host()}/api/synsets/{self.pos()}/{self.offset()}/examples/?format=json",
            timeout=(30.0, 90.0),
        )
        if response:
            self._examples = response.json()["results"][0]["examples"]
    return self._examples
def _needs_root(self):
    """Return ``True`` when the synset's part of speech is ``"n"`` or ``"v"``."""
    return self._pos in ("n", "v")
def lemmas(self):
    """Return all the Lemma objects associated with the synset.

    The raw payload, which groups lemma records by sense type, is fetched
    once and cached; a failed request is cached as an empty list.

    :return: A generator of Lemma objects.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> for lemma in sorted(set(s1.lemmas())):
    ...     print(lemma.lemma())
    clunaculum
    gladiolus
    parazonium
    pugio
    pugiunculus
    sica
    sicula
    """
    if self._lemmas is None:
        response = requests.get(
            f"{self._wordnet_corpus_reader.host()}/api/synsets/{self.pos()}/{self.offset()}/lemmas/?format=json",
            timeout=(30.0, 90.0),
        )
        if response:
            self._lemmas = response.json()["results"][0]["lemmas"]
        else:
            self._lemmas = []
    return (
        Lemma(
            self._wordnet_corpus_reader,
            lemma["lemma"],
            lemma["pos"],
            lemma["morpho"],
            lemma["uri"],
        )
        for sense_type in self._lemmas
        for lemma in self._lemmas[sense_type]
    )
def root_hypernyms(self):
    """Get the topmost hypernyms of this synset in WordNet.

    Walks the hypernym relation upwards from this synset and collects every
    synset reached that has no hypernyms of its own. Each synset is visited
    at most once.

    :return: A list of the root synsets found.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s1.root_hypernyms()
    [Synset(pos='n', offset='00001740', gloss='anything having existence (living or nonliving)')]
    """
    roots = []
    visited = set()
    pending = [self]
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        parents = current.hypernyms()
        if parents:
            pending.extend(parents)
        else:
            roots.append(current)
    return roots
def max_depth(self):
    """Get the length of the longest hypernym path from this synset to the root.

    The value is computed once and cached on the instance.

    :return: An integer value representing the maximum path length to the root.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s1.max_depth()
    7
    """
    # ``_max_depth`` is declared in ``__slots__``, so it never appears in
    # ``self.__dict__``; probe the attribute itself to detect the cache.
    try:
        return self._max_depth
    except AttributeError:
        pass
    hypernyms = self.hypernyms()
    if not hypernyms:
        self._max_depth = 0
    else:
        self._max_depth = 1 + max(h.max_depth() for h in hypernyms)
    return self._max_depth
def min_depth(self):
    """Get min depth.

    The value is computed once and cached on the instance.

    :return: The length of the shortest hypernym path from this
        synset to the root.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s1.min_depth()
    7
    """
    # ``_min_depth`` is declared in ``__slots__``, so it never appears in
    # ``self.__dict__``; probe the attribute itself to detect the cache.
    try:
        return self._min_depth
    except AttributeError:
        pass
    hypernyms = self.hypernyms()
    if not hypernyms:
        self._min_depth = 0
    else:
        self._min_depth = 1 + min(h.min_depth() for h in hypernyms)
    return self._min_depth
def closure(self, rel, depth=-1):
    """Return the transitive closure of the synset under the rel
    relationship, breadth-first.

    The starting synset itself is never yielded, and each synset (identified
    by ``id()``) is yielded at most once.

    :param rel: callable mapping a synset to its related synsets.
    :param depth: forwarded to ``nltk.util.breadth_first`` as its depth limit.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> hypers = lambda s: s.hypernyms()
    >>> list(s1.closure(hypers))
    [Synset(pos='n', offset='02893681', gloss='a weapon with a handle and blade with a sharp point'), Synset(pos='n', offset='03601056', gloss='weaponry used in fighting or hunting'), Synset(pos='n', offset='03601456', gloss='weapons considered collectively'), Synset(pos='n', offset='02859872', gloss='an artifact (or system of artifacts) that is instrumental in accomplishing some end'), Synset(pos='n', offset='00011937', gloss='a man-made object'), Synset(pos='n', offset='00009457', gloss='a physical (tangible and visible) entity'), Synset(pos='n', offset='00001740', gloss='anything having existence (living or nonliving)')]
    """
    from nltk.util import breadth_first

    own_id = self.id()
    # A set gives constant-time duplicate checks; yield order is unaffected.
    seen_ids = set()
    for synset in breadth_first(self, rel, depth):
        synset_id = synset.id()
        if synset_id == own_id or synset_id in seen_ids:
            continue
        seen_ids.add(synset_id)
        yield synset
def hypernym_paths(self):
    """Collect every hypernym path leading from a root down to this synset.

    :return: A list of lists. Each inner list starts at a synset that has
        no hypernyms, follows one chain of hypernym links and ends with
        this ``Synset``.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s1.hypernym_paths()
    [[Synset(pos='n', offset='00001740', gloss='anything having existence (living or nonliving)'), Synset(pos='n', offset='00009457', gloss='a physical (tangible and visible) entity'), Synset(pos='n', offset='00011937', gloss='a man-made object'), Synset(pos='n', offset='02859872', gloss='an artifact (or system of artifacts) that is instrumental in accomplishing some end'), Synset(pos='n', offset='03601456', gloss='weapons considered collectively'), Synset(pos='n', offset='03601056', gloss='weaponry used in fighting or hunting'), Synset(pos='n', offset='02893681', gloss='a weapon with a handle and blade with a sharp point'), Synset(pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')]]
    """
    parents = self.hypernyms()
    if len(parents) == 0:
        # No hypernyms: this synset is a root, so the only path is itself.
        return [[self]]
    routes = []
    for parent in parents:
        for route in parent.hypernym_paths():
            # Extend each of the parent's root-to-parent paths by this node.
            route.append(self)
            routes.append(route)
    return routes
def common_hypernyms(self, other):
    """Find all synsets that are hypernyms of both this synset and ``other``.

    Each synset's ancestor set is built from ``_iter_hypernym_lists()`` the
    first time it is needed and stored on that synset as ``_all_hypernyms``.

    :type other: Synset
    :param other: other input synset.
    :return: A list (in no particular order) of the synsets found in both
        ancestor sets.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s2 = Synset(LWN, None, pos='n', offset='03457380', gloss='a cutting or thrusting weapon with a long blade')
    >>> sorted(s1.common_hypernyms(s2))
    [Synset(pos='n', offset='00001740', gloss='anything having existence (living or nonliving)'), Synset(pos='n', offset='00009457', gloss='a physical (tangible and visible) entity'), Synset(pos='n', offset='00011937', gloss='a man-made object'), Synset(pos='n', offset='02859872', gloss='an artifact (or system of artifacts) that is instrumental in accomplishing some end'), Synset(pos='n', offset='03601056', gloss='weaponry used in fighting or hunting'), Synset(pos='n', offset='03601456', gloss='weapons considered collectively')]
    """
    for synset in (self, other):
        # Populate the per-synset ancestor cache on first use.
        if not synset._all_hypernyms:
            synset._all_hypernyms = {
                ancestor
                for level in synset._iter_hypernym_lists()
                for ancestor in level
            }
    shared = self._all_hypernyms.intersection(other._all_hypernyms)
    return list(shared)
def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
    """Get the deepest synset(s) that both synsets have as a hypernym.

    Among the common hypernyms of ``self`` and ``other``, the ones with the
    greatest depth are returned; ties are all returned, sorted. Depth is
    measured with ``max_depth()`` by default, or with ``min_depth()`` when
    ``use_min_depth`` is true.

    :type other: Synset
    :param other: other input synset
    :type simulate_root: bool
    :param simulate_root: When true, a synthetic root synset (offset
        ``"00000000"``, same part of speech as ``self``) is added to the
        candidates, so that synsets from otherwise unconnected taxonomies
        still share a hypernym. False by default.
    :type use_min_depth: bool
    :param use_min_depth: Rank candidates by ``min_depth()`` instead of
        ``max_depth()``. This mimics older (v2) NLTK WordNet behaviour; it
        is known to give strange results for some synset pairs but is
        retained for backwards compatibility.
    :return: The sorted list of lowest common hypernyms; an empty list if
        there are no candidates (or a ``ValueError`` occurs while ranking).

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s2 = Synset(LWN, None, pos='n', offset='03457380', gloss='a cutting or thrusting weapon with a long blade')
    >>> s1.lowest_common_hypernyms(s2)
    [Synset(pos='n', offset='03601056', gloss='weaponry used in fighting or hunting')]
    """
    candidates = self.common_hypernyms(other)
    if simulate_root:
        candidates.append(
            Synset(self._wordnet_corpus_reader, None, self.pos(), "00000000", "")
        )
    try:
        if use_min_depth:
            depth_of = lambda synset: synset.min_depth()
        else:
            depth_of = lambda synset: synset.max_depth()
        # ``max`` raises ValueError on an empty candidate list.
        deepest = max(depth_of(synset) for synset in candidates)
        return sorted(
            synset for synset in candidates if depth_of(synset) == deepest
        )
    except ValueError:
        return []
def hypernym_distances(self, distance=0, simulate_root=False):
    """Collect this synset and all its hypernyms with their distances.

    The hierarchy is walked upwards recursively; every node reached is
    recorded together with the number of edges separating it from the
    synset on which the method was originally called. A node reachable
    along paths of different length appears once per distinct distance.

    :type distance: int
    :param distance: the distance (number of edges) from this synset to
        the original ``Synset`` on which this method was called.
    :type simulate_root: bool
    :param simulate_root: when true, a synthetic root synset (offset
        ``"00000000"``) is added one step beyond the farthest node found.
    :return: A list of unique ``(Synset, int)`` tuples, in no particular
        order.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> sorted(s1.hypernym_distances())
    [(Synset(pos='n', offset='00001740', gloss='anything having existence (living or nonliving)'), 7), (Synset(pos='n', offset='00009457', gloss='a physical (tangible and visible) entity'), 6), (Synset(pos='n', offset='00011937', gloss='a man-made object'), 5), (Synset(pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade'), 0), (Synset(pos='n', offset='02859872', gloss='an artifact (or system of artifacts) that is instrumental in accomplishing some end'), 4), (Synset(pos='n', offset='02893681', gloss='a weapon with a handle and blade with a sharp point'), 1), (Synset(pos='n', offset='03601056', gloss='weaponry used in fighting or hunting'), 2), (Synset(pos='n', offset='03601456', gloss='weapons considered collectively'), 3)]
    """
    distances = {(self, distance)}
    for hypernym in self._hypernyms():
        # The synthetic root is only ever added by the outermost call.
        distances.update(
            hypernym.hypernym_distances(distance + 1, simulate_root=False)
        )
    if simulate_root:
        # Same (reader, language, pos, offset, gloss) argument layout as the
        # other synthetic-root constructions in this class.
        root = Synset(self._wordnet_corpus_reader, None, self.pos(), "00000000", "")
        farthest = max(node_distance for _, node_distance in distances)
        distances.add((root, farthest + 1))
    return list(distances)
def _shortest_hypernym_paths(self, simulate_root):
    """Map this synset and each of its ancestors to its shortest distance.

    A breadth-first walk over ``_hypernyms()`` records, for every synset
    reached, the smallest number of edges separating it from ``self``.

    :param simulate_root: when true, a synthetic root synset (offset
        ``"00000000"``) is added one step beyond the farthest ancestor.
    :return: A dict ``{Synset: distance}``; ``self`` maps to ``0``.
    """
    # The synthetic root has no ancestors. ``offset`` is an accessor method,
    # so it has to be called for this comparison to ever succeed.
    if self.offset() == "00000000":
        return {self: 0}
    queue = deque([(self, 0)])
    path = {}
    while queue:
        s, depth = queue.popleft()
        if s in path:
            # Already reached earlier, i.e. at a distance no greater.
            continue
        path[s] = depth
        depth += 1
        queue.extend((hyp, depth) for hyp in s._hypernyms())
    if simulate_root:
        root = Synset(self._wordnet_corpus_reader, None, self.pos(), "00000000", "")
        path[root] = max(path.values()) + 1
    return path
def shortest_path_distance(self, other, simulate_root=False):
    """Return the length of the shortest path linking the two synsets.

    The ancestors of each synset are collected together with their
    distances. For every ancestor the two synsets share, the two distances
    are added, and the smallest such sum is the result.

    :type other: Synset
    :param other: The Synset to which the shortest path will be found.
    :type simulate_root: bool
    :param simulate_root: passed to ``_shortest_hypernym_paths`` for both
        synsets.
    :return: The number of edges in the shortest path connecting the two
        nodes, ``0`` if the synsets are equal, or None if they share no
        ancestor.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s2 = Synset(LWN, None, pos='n', offset='03457380', gloss='a cutting or thrusting weapon with a long blade')
    >>> s1.shortest_path_distance(s2)
    3
    """
    if self == other:
        return 0
    own_depths = self._shortest_hypernym_paths(simulate_root)
    other_depths = other._shortest_hypernym_paths(simulate_root)
    # One candidate path length per ancestor present on both sides.
    candidate_lengths = [
        own_depth + other_depths[ancestor]
        for ancestor, own_depth in own_depths.items()
        if ancestor in other_depths
    ]
    if not candidate_lengths:
        return None
    return min(candidate_lengths)
def tree(self, rel, depth=-1, cut_mark=None):
    """Generate a tree-like nested list for the ``rel`` relationship of this synset.

    :param rel: A function returning the relations of a certain kind of this synset.
    :param depth: Number of levels to expand below this synset. Expansion
        stops when the counter reaches ``0``; the default ``-1`` never
        reaches ``0`` and therefore does not limit the expansion.
    :param cut_mark: An object used to indicate where a branch has been truncated.
    :return: A list whose first item is this synset, followed by one nested
        list per related synset (or by ``cut_mark`` where the depth limit
        was hit and a truthy ``cut_mark`` was given).

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset(pos='n', offset='01595188')
    >>> hypers = lambda s: s.hypernyms()
    >>> s1.tree(hypers)
    [Synset(pos='n', offset='01595188', gloss='a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds; "the dog barked all night"'), [Synset(pos='n', offset='01594481', gloss='any of various fissiped mammals with nonretractile claws and typically long muzzles'), [Synset(pos='n', offset='01586585', gloss='terrestrial or aquatic flesh-eating mammal; terrestrial carnivores have four or five clawed digits on each limb'), [Synset(pos='n', offset='01402712', gloss='mammals having a placenta; all mammals except monotremes and marsupials'), [Synset(pos='n', offset='01378363', gloss='any warm-blooded vertebrate having the skin more or less covered with hair; young are born alive except for the small subclass of monotremes and nourished with milk'), [Synset(pos='n', offset='00995974', gloss='animals having a bony or cartilaginous skeleton with a segmented spinal column and a large brain enclosed in a skull or cranium'), [Synset(pos='n', offset='00990770', gloss='any animal of the phylum Chordata having a notochord or spinal column'), [Synset(pos='n', offset='00008019', gloss='a living organism characterized by voluntary movement'), [Synset(pos='n', offset='00002086', gloss='any living entity'), [Synset(pos='n', offset='00001740', gloss='anything having existence (living or nonliving)')]]]]]]]]]]
    """
    if depth != 0:
        branches = [
            related.tree(rel, depth - 1, cut_mark) for related in rel(self)
        ]
        return [self] + branches
    if cut_mark:
        # Depth limit reached: flag the truncated branch.
        return [self, cut_mark]
    return [self]
# Similarity methods
def path_similarity(self, other, verbose=False, simulate_root=True):
    """Path Distance Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses in the is-a (hypernym/hyponym)
    taxonomy. The score is ``1 / (distance + 1)``, so it lies in the range
    0 to 1 and equals 1 when a sense is compared with itself.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :param verbose: Not used by this method.
    :type simulate_root: bool
    :param simulate_root: The various verb taxonomies do not
        share a single root which disallows this metric from working for
        synsets that are not connected. This flag (True by default)
        creates a fake root that connects all the taxonomies. Set it
        to false to disable this behavior. The fake root is only
        requested when ``self._needs_root()`` is also true.
    :return: A score denoting the similarity of the two ``Synset`` objects,
        normally between 0 and 1. None is returned if no connecting path
        could be found. 1 is returned if a ``Synset`` is compared with
        itself.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s2 = Synset(LWN, None, pos='n', offset='03457380', gloss='a cutting or thrusting weapon with a long blade')
    >>> s1.path_similarity(s2)
    0.25
    """
    use_fake_root = simulate_root and self._needs_root()
    distance = self.shortest_path_distance(other, simulate_root=use_fake_root)
    no_path = distance is None or distance < 0
    return None if no_path else 1.0 / (distance + 1)
def _lcs_ic(self, other, icreader, verbose=False):  # pragma: no cover
    """Get the information content of the two synsets and of their most
    informative common subsumer.

    If the two synsets have no common hypernym at all, the subsumer's
    information content is taken to be 0, as if they shared an artificial
    root node.

    :type other: Synset
    :param other: Second input synset. Must be the same part of
        speech as this synset.
    :param icreader: an information content reader object providing
        ``information_content(synset)``
    :param verbose: when true, print the subsumer's information content.
    :return: A tuple ``(ic_self, ic_other, ic_subsumer)``.
    :raises WordNetError: if the two synsets differ in part of speech.
    """
    if self._pos != other._pos:
        raise WordNetError(
            "Computing the least common subsumer requires "
            "%s and %s to have the same part of speech." % (self, other)
        )
    own_ic = icreader.information_content(self)
    other_ic = icreader.information_content(other)
    shared = self.common_hypernyms(other)
    # ``default=0`` covers the case of no common hypernym.
    best_ic = max((icreader.information_content(s) for s in shared), default=0)
    if verbose:
        print("> LCS Subsumer by content:", best_ic)
    return own_ic, other_ic, best_ic
def lch_similarity(self, other, verbose=False, simulate_root=True):  # pragma: no cover
    """Leacock Chodorow Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses and the maximum depth of the
    taxonomy in which the senses occur. The score is
    ``-log((p + 1) / (2 * d))`` where ``p`` is the shortest path length and
    ``d`` the taxonomy depth. Because the depth of the whole taxonomy for
    the part of speech has to be computed on first use, this can be very
    slow!

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :param verbose: Not used by this method.
    :type simulate_root: bool
    :param simulate_root: The various verb taxonomies do not
        share a single root which disallows this metric from working for
        synsets that are not connected. This flag (True by default)
        creates a fake root that connects all the taxonomies. Set it
        to false to disable this behavior.
    :return: A score denoting the similarity of the two ``Synset`` objects,
        normally greater than 0. None is returned if no connecting path
        could be found or the taxonomy depth is 0.
    :raises WordNetError: if the two synsets differ in part of speech.
    """
    if self._pos != other._pos:
        raise WordNetError(
            "Computing the lch similarity requires "
            "%s and %s to have the same part of speech." % (self, other)
        )
    need_root = self._needs_root()
    reader = self._wordnet_corpus_reader
    pos = self._pos
    # The taxonomy depth is cached on the reader, one entry per part of speech.
    if pos not in reader._max_depth:
        reader._compute_max_depth(pos, need_root)
    taxonomy_depth = reader._max_depth[pos]
    distance = self.shortest_path_distance(
        other, simulate_root=simulate_root and need_root
    )
    unusable = distance is None or distance < 0 or taxonomy_depth == 0
    if unusable:
        return None
    return -math.log((distance + 1) / (2.0 * taxonomy_depth))
def wup_similarity(self, other, verbose=False, simulate_root=True):
    """Wu-Palmer Similarity:
    Return a score denoting how similar two word senses are, based on the
    depth of the two senses in the taxonomy and that of their Least Common
    Subsumer (most specific ancestor node). Previously, the scores computed
    by this implementation did _not_ always agree with those given by
    Pedersen's Perl implementation of WordNet Similarity. However, with
    the addition of the simulate_root flag (see below), the score for
    verbs now almost always agree but not always for nouns.

    The LCS does not necessarily feature in the shortest path connecting
    the two senses, as it is by definition the common ancestor deepest in
    the taxonomy, not closest to the two senses. Typically, however, it
    will so feature. Where multiple candidates for the LCS exist, that
    whose shortest path to the root node is the longest will be selected.
    Where the LCS has multiple paths to the root, the longer path is used
    for the purposes of the calculation.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :param verbose: Not used by this method.
    :type simulate_root: bool
    :param simulate_root: The various verb taxonomies do not
        share a single root which disallows this metric from working for
        synsets that are not connected. This flag (True by default)
        creates a fake root that connects all the taxonomies. Set it
        to false to disable this behavior.
    :return: A float score denoting the similarity of the two ``Synset``
        objects, normally greater than zero. If no connecting path between
        the two senses can be found, None is returned.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = Synset(LWN, None, pos='n', offset='02542418', gloss='a short stabbing weapon with a pointed blade')
    >>> s2 = Synset(LWN, None, pos='n', offset='03457380', gloss='a cutting or thrusting weapon with a long blade')
    >>> s1.wup_similarity(s2)
    0.8
    """
    need_root = self._needs_root()
    use_fake_root = simulate_root and need_root
    # use_min_depth=True preserves the behaviour inherited from NLTK 2;
    # dropping it might give more accurate results but is untested.
    candidates = self.lowest_common_hypernyms(
        other, simulate_root=use_fake_root, use_min_depth=True
    )
    if len(candidates) == 0:
        # No least common subsumer at all.
        return None
    lcs = self if self in candidates else candidates[0]
    # Longest path from the LCS to the root, plus one because both the
    # start and the end node are counted. A simulated root, when used,
    # already accounts for the extra level of non-noun taxonomies.
    lcs_depth = lcs.max_depth() + 1
    # Shortest path from each synset up to the LCS.
    own_len = self.shortest_path_distance(lcs, simulate_root=use_fake_root)
    other_len = other.shortest_path_distance(lcs, simulate_root=use_fake_root)
    if own_len is None or other_len is None:
        return None
    # Root-to-synset path lengths are LCS depth plus distance to the LCS.
    own_total = own_len + lcs_depth
    other_total = other_len + lcs_depth
    return (2.0 * lcs_depth) / (own_total + other_total)
def res_similarity(self, other, icreader, verbose=False):
    """Resnik Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type icreader: WordNetICCorpusReader
    :param icreader: an information content reader
    :param verbose: Not used by this method.
    :return: A float score denoting the similarity of the two ``Synset``
        objects. Synsets whose LCS is the root node of the taxonomy will
        have a score of 0 (e.g. N['dog'][0] and N['table'][0]).

    >>> from cltk.wordnet.wordnet import WordNetCorpusReader, WordNetICCorpusReader
    >>> LASLA_IC = WordNetICCorpusReader(iso_code="lat", fileids=['ic-lasla.dat'])
    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('n', '02542418')
    >>> s2 = LWN.synset_from_pos_and_offset('n', '03457380')
    >>> s1.res_similarity(s2, LASLA_IC)
    6.056495670686355
    """
    # Only the subsumer's information content matters for this measure.
    _own_ic, _other_ic, subsumer_ic = self._lcs_ic(other, icreader)
    return subsumer_ic
def jcn_similarity(self, other, icreader, verbose=False):
    """Jiang-Conrath Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type icreader: WordNetICCorpusReader
    :param icreader: an information content reader
    :param verbose: Not used by this method.
    :return: A float score denoting the similarity of the two ``Synset``
        objects; ``_INF`` for equal synsets or a zero IC difference, and
        0 when either synset has an information content of 0.

    >>> from cltk.wordnet.wordnet import WordNetCorpusReader, WordNetICCorpusReader
    >>> LASLA_IC = WordNetICCorpusReader(iso_code='lat', fileids=['ic-lasla.dat'])
    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('n', '02542418')
    >>> s2 = LWN.synset_from_pos_and_offset('n', '03457380')
    >>> s1.jcn_similarity(s2, LASLA_IC)
    0.23789011550933925
    """
    if self == other:
        return _INF
    own_ic, other_ic, subsumer_ic = self._lcs_ic(other, icreader)
    # A zero IC means a root synset or a frequency of 0 (sparse data).
    if own_ic == 0 or other_ic == 0:
        return 0
    gap = own_ic + other_ic - 2 * subsumer_ic
    return _INF if gap == 0 else 1 / gap
def lin_similarity(self, other, icreader, verbose=False):
    """Lin Similarity.
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type icreader: WordNetICCorpusReader
    :param icreader: an information content reader
    :param verbose: Not used by this method.
    :return: A float score denoting the similarity of the two ``Synset``
        objects, in the range 0 to 1. When the information contents of the
        two synsets sum to 0 the ratio is undefined and 0.0 is returned.

    >>> from cltk.wordnet.wordnet import WordNetCorpusReader, WordNetICCorpusReader
    >>> LASLA_IC = WordNetICCorpusReader(iso_code="lat", fileids=['ic-lasla.dat'])
    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> s1 = LWN.synset_from_pos_and_offset('n', '02542418')
    >>> s2 = LWN.synset_from_pos_and_offset('n', '03457380')
    >>> s1.lin_similarity(s2, LASLA_IC)
    0.7423716841366877
    """
    ic1, ic2, lcs_ic = self._lcs_ic(other, icreader)
    total_ic = ic1 + ic2
    if total_ic == 0:
        # Avoid a ZeroDivisionError; follows the zero-IC convention of
        # jcn_similarity, which also scores such pairs as 0.
        return 0.0
    return (2.0 * lcs_ic) / total_ic
def _iter_hypernym_lists(self):
    """Walk up the hypernym hierarchy one level at a time.

    :return: A generator of lists of ``Synset`` objects. The first list
        holds this synset only; each following list holds the hypernyms of
        the previous list's members that had not been seen on an earlier
        level (a synset may occur twice within one level).
    """
    level = [self]
    visited = set()
    while level:
        visited.update(level)
        yield level
        level = [
            parent
            for member in level
            for parent in member.hypernyms()
            if parent not in visited
        ]
def __repr__(self):
    """Return a ``Synset(pos=..., offset=..., gloss=...)`` description."""
    template = "Synset(pos='{}', offset='{}', gloss='{}')"
    return template.format(self.pos(), self.offset(), self.gloss())
@property
def _related(self):
    """Relations of this synset, fetched from the WordNet service on demand.

    The first access requests the synset's ``relations`` endpoint and
    stores the outcome on the instance; later accesses return the stored
    value without another request.

    :return: The ``relations`` entry of the first result, or an empty list
        when the request was unsuccessful or returned no results.
    """
    if self.__related is None:
        response = requests.get(
            f"{self._wordnet_corpus_reader.host()}/api/synsets/{self.pos()}/{self.offset()}/relations/?format=json",
            timeout=(30.0, 90.0),
        )
        # Decode the body once, and only for a successful response.
        matches = response.json()["results"] if response else []
        self.__related = matches[0]["relations"] if matches else []
    return self.__related
def __eq__(self, other):
    """Two synsets are equal when part of speech and offset both match.

    :param other: the object to compare with.
    :return: ``True``/``False`` for objects carrying ``_pos`` and
        ``_offset``; ``NotImplemented`` for anything else, so that
        comparing with e.g. ``None`` evaluates to ``False`` instead of
        raising ``AttributeError``.
    """
    try:
        return self._pos == other._pos and self._offset == other._offset
    except AttributeError:
        return NotImplemented
def __ne__(self, other):
    """Return the negation of ``self == other``."""
    is_equal = self == other
    return not is_equal
def __lt__(self, other):
    """Order synsets of the same part of speech by offset.

    :raises WordNetError: if the two synsets differ in part of speech.
    """
    if self._pos == other._pos:
        return self._offset < other._offset
    raise WordNetError(
        "operation undefined for '{}' and '{}'".format(self._pos, other._pos)
    )
def __hash__(self):
    """Hash on the ``pos#offset`` string built from the accessors."""
    identity = f"{self.pos()}#{self.offset()}"
    return hash(identity)
######################################################################
# WordNet Corpus Reader
######################################################################
[docs]class WordNetCorpusReader(CorpusReader):
"""A corpus reader used to access a WordNet.
:param iso_code: The ISO code for one of the languages providing a WordNet API
>>> LWN = WordNetCorpusReader(iso_code="lat")
>>> animus = LWN.lemma('animus', 'n', 'n-s---mn2-')
>>> print(animus)
[Lemma(lemma='animus', pos='n', morpho='n-s---mn2-', uri='a2046')]
>>> dico = LWN.lemmas('dico', 'v')
>>> print(sorted(list(dico), key=lambda x: x.uri()))
[Lemma(lemma='dico', pos='v', morpho='v1spia--1-', uri='d1349'), Lemma(lemma='dico', pos='v', morpho='v1spia--3-', uri='d1350')]
>>> virtus = LWN.lemmas_from_uri('u0800')
>>> print(virtus)
[Lemma(lemma='uirtus', pos='n', morpho='n-s---fn3-', uri='u0800')]
>>> courage = LWN.synset('n#03805961')
>>> print(courage)
Synset(pos='n', offset='03805961', gloss='a quality of spirit that enables you to face danger of pain without showing fear')
>>> adverbs = LWN.synsets('r')
>>> print(len(list(adverbs)) > 3600)
True
"""
# Base URL of the WordNet web service for each supported ISO language code;
# ``__init__`` looks the reader's host up in this table.
_DEFAULT_HOSTS = {
"san": "https://sanskritwordnet.unipv.it",
"grc": "https://greekwordnet.chs.harvard.edu",
"lat": "https://latinwordnet.exeter.ac.uk",
}
# Encoding handed to the ``CorpusReader`` base-class constructor.
_ENCODING = "utf8"
# { Part of speech constants
# Part-of-speech tag -> integer code.
_pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, PREP: 5}
# Inverse of ``_pos_numbers``: integer code -> part-of-speech tag.
_pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
# }
def __init__(self, iso_code, ignore_errors=False):
    """Construct a new WordNet corpus reader.

    :param iso_code: ISO language code used as key into ``_DEFAULT_HOSTS``
        to select the web service to talk to.
    :param ignore_errors: stored on the instance as ``_ignore_errors``.
    :raises KeyError: if ``iso_code`` has no entry in ``_DEFAULT_HOSTS``.
    """
    super(WordNetCorpusReader, self).__init__(
        root="", fileids=None, encoding=self._ENCODING
    )
    self._iso_code = iso_code
    self._host = self._DEFAULT_HOSTS[iso_code]
    self._ignore_errors = ignore_errors

    # Synset cache, so synsets do not have to be reconstructed:
    # pos -> offset -> Synset
    self._synset_cache = nesteddict()

    # Lemma cache, so lemmas do not have to be reconstructed:
    # lemma -> pos -> morpho -> uri -> Lemma
    self._lemma_cache = nesteddict()

    # Maximum taxonomy depth per part of speech, filled in lazily for the
    # lch similarity metric.
    self._max_depth = defaultdict(dict)
def host(self):
    """Return the base URL of the WordNet service this reader talks to."""
    base_url = self._host
    return base_url
def _compute_max_depth(self, pos, simulate_root):  # pragma: no cover
    """Compute and store the max depth for the given part of speech.

    The result is written to ``self._max_depth[pos]``; it is used by the
    lch similarity metric.

    :param pos: part of speech whose synsets are inspected.
    :param simulate_root: when true, one extra level is added to the
        result to account for a simulated root.
    """
    deepest = 0
    for synset in self.synsets(pos=pos):
        try:
            candidate = synset.max_depth()
        except RuntimeError:
            # Report the offending synset and carry on with the rest.
            print(synset)
            continue
        if candidate > deepest:
            deepest = candidate
    self._max_depth[pos] = deepest + 1 if simulate_root else deepest
def get_status(self):  # pragma: no cover
    """Request the service's ``/api/status/`` endpoint.

    :return: The response object returned by ``requests.get``.
    """
    return requests.get(
        f"{self.host()}/api/status/?format=json", timeout=(30.0, 90.0)
    )
#############################################################
# Loading Lemmas
#############################################################
def lemma(self, lemma, pos="", morpho="", return_ambiguous=True):
    """Takes ``lemma`` and finds matching headword.

    If ``pos`` or ``morpho`` is provided, the results found through
    ``lemma`` alone are filtered. ``pos`` tags are in the form
    ``n`` for noun, ``v`` for verb, ``a`` for adjective, ``r`` for adverb.
    If ``return_ambiguous`` is ``False``, only the first matching lemma
    is returned as a single-element list. If ``True``, (default) all the
    matching lemmas will be returned.

    Cached lemmas matching the filters are returned without contacting the
    service; otherwise the service is queried and the results are cached.

    :param lemma: headword to look up.
    :param pos: optional part-of-speech filter.
    :param morpho: optional morphological-descriptor filter.
    :param return_ambiguous: return all matches (default) or only the first.
    :return: A list of ``Lemma`` objects; empty if nothing was found.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> LWN.lemma('baculum')
    [Lemma(lemma='baculum', pos='n', morpho='n-s---nn2-', uri='b0034')]
    """
    resolved = []
    if lemma in self._lemma_cache:
        logger.debug(f"lemma found in cache: {lemma}")
        # NOTE(review): the cache only holds what earlier requests returned,
        # so a hit may be a subset of what the service knows - confirm this
        # is acceptable.
        cached_by_pos = self._lemma_cache[lemma]
        for cached_pos in cached_by_pos:
            if pos and cached_pos != pos:
                continue
            cached_by_morpho = cached_by_pos[cached_pos]
            for cached_morpho in cached_by_morpho:
                if morpho and cached_morpho != morpho:
                    continue
                cached_by_uri = cached_by_morpho[cached_morpho]
                # Collect the Lemma objects themselves, not their URI keys.
                resolved.extend(cached_by_uri[uri] for uri in cached_by_uri)
    if not resolved:
        logger.debug(f"REQUEST: {lemma}, (pos={pos}, morpho={morpho})")
        # The response is also kept on ``self.json``, as before.
        results = self.json = requests.get(
            f"{self.host()}/api/lemmas/{lemma if lemma else '*'}/{pos if pos else '*'}"
            f"/{morpho if morpho else '*'}?format=json",
            timeout=(30.0, 90.0),
        )
        if results:
            for item in results.json()["results"]:
                found = Lemma(self, **item)
                resolved.append(found)
                self._lemma_cache[lemma][item["pos"]][item["morpho"]][
                    item["uri"]
                ] = found
    return resolved if return_ambiguous else resolved[:1]
def lemma_from_uri(self, uri):
    """Get lemma from URI.

    :param uri: the URI identifying the lemma.
    :return: The matching ``Lemma`` (which is also added to the lemma
        cache), or None when the request was unsuccessful or returned no
        results.
    :raises WordNetError: if the URI matches more than one lemma.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> LWN.lemma_from_uri('b0034')
    Lemma(lemma='baculum', pos='n', morpho='n-s---nn2-', uri='b0034')
    """
    # The response is also kept on ``self.json``, as before.
    results = self.json = requests.get(
        f"{self.host()}/api/uri/{uri}?format=json", timeout=(30.0, 90.0)
    )
    if not results:
        return None
    data = results.json()["results"]
    if not data:
        # Nothing matched: same outcome as an unsuccessful request.
        return None
    if len(data) > 1:
        # Build the message from the decoded results, not the response.
        ambiguous = [f"{result['lemma']} ({result['morpho']})" for result in data]
        raise WordNetError(f"can't disambiguate {', '.join(ambiguous)}")
    entry = data[0]
    found = Lemma(self, **entry)
    self._lemma_cache[entry["lemma"]][entry["pos"]][entry["morpho"]][
        entry["uri"]
    ] = found
    return found
def semfield(self, code, english):
    """Look up a single semantic field by code and English label.

    Spaces in ``english`` are replaced by underscores before the request
    is made.

    :param code: the semfield code.
    :param english: the English label of the semfield.
    :return: A ``Semfield`` built from the first result, or None when the
        request was unsuccessful.
    :raises WordNetError: if the service returns no results.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> LWN.semfield('910', 'Geography & travel')
    Semfield(code='910', english='Geography & travel')
    """
    english = english.replace(" ", "_")
    # Fetch the semfield information; the response is also kept on self.json.
    response = self.json = requests.get(
        f"{self.host()}/api/semfields/{code}/{english}/?format=json",
        timeout=(30.0, 90.0),
    )
    if not response:
        return None
    matches = response.json()["results"]
    if len(matches) == 0:
        raise WordNetError(f"semfield {code} '{english}' not found")
    first = matches[0]
    return Semfield(self, first["code"], first["english"])
#############################################################
# Loading Synsets
#############################################################
def synset(self, id):
    """Get synset.

    :param id: Synset id, consisting of POS and offset separated by '#'
    :return: Synset object
    :raises WordNetError: if ``id`` does not match ``SENSENUM_RE`` or no
        synset is found for it.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> LWN.synset('r#L2556264')
    Synset(pos='r', offset='L2556264', gloss='in the manner of a woman')
    """
    match = SENSENUM_RE.search(id)
    if match is None:
        # Report a malformed id as such rather than as an AttributeError.
        raise WordNetError(f"malformed synset id {id!r}: expected '<pos>#<offset>'")
    pos, offset = match.groups()
    # load synset information
    synset = self.synset_from_pos_and_offset(pos, offset)
    if synset is None:
        raise WordNetError(f"synset {id} not found")
    # Return the synset object.
    return synset
def synset_from_pos_and_offset(self, pos, offset):
    """Get synset from part of speech and offset.

    The synset cache is consulted first; on a miss the service is queried
    and the resulting synset is cached.

    :param pos: part-of-speech tag of the synset.
    :param offset: offset of the synset.
    :return: The ``Synset``, or None when the request was unsuccessful or
        returned no results.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> LWN.synset_from_pos_and_offset('r', 'L2556264')
    Synset(pos='r', offset='L2556264', gloss='in the manner of a woman')
    """
    # Check to see if the synset is in the cache
    cached = self._synset_cache[pos]
    if offset in cached:
        return cached[offset]
    results = requests.get(
        f"{self.host()}/api/synsets/{pos}/{offset}?format=json",
        timeout=(30.0, 90.0),
    )
    if not results:
        # Unsuccessful request: report "not found" instead of falling
        # through to an unbound local.
        return None
    matches = results.json()["results"]
    if not matches:
        return None
    synset = Synset(self, **matches[0])
    cached[offset] = synset
    return synset
#############################################################
# Retrieve synsets and lemmas.
#############################################################
def lemmas(self, lemma=None, pos=None, morpho=None):
    """Return all Lemma objects with a name matching the specified lemma
    name, part of speech tag or morphological descriptor.

    Any filter left empty is sent to the service as the wildcard ``*``.

    :param lemma: optional lemma name.
    :param pos: optional part-of-speech tag.
    :param morpho: optional morphological descriptor.
    :return: An iterator over ``Lemma`` objects; it is empty when the
        request was unsuccessful or the response carried no data.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> sorted(list(LWN.lemmas('dico', 'v')), key=lambda x: x.uri())
    [Lemma(lemma='dico', pos='v', morpho='v1spia--1-', uri='d1349'), Lemma(lemma='dico', pos='v', morpho='v1spia--3-', uri='d1350')]
    """
    response = requests.get(
        f"{self.host()}/api/lemmas/{lemma if lemma else '*'}/{pos if pos else '*'}/"
        f"{morpho if morpho else '*'}?format=json",
        timeout=(30.0, 90.0),
    )
    # Guard before decoding, like the other loaders in this class do.
    if not response:
        return iter(())
    payload = response.json()
    if not payload:
        return iter(())
    # NOTE(review): only this first page is read, whereas synsets() and
    # semfields() follow the "next" links - confirm whether this endpoint
    # paginates.
    return (
        Lemma(self, item["lemma"], item["pos"], item["morpho"], item["uri"])
        for item in payload["results"]
    )
def lemmas_from_uri(self, uri):
    """Get all lemmas registered under a URI.

    Every lemma found is also stored in the lemma cache.

    :param uri: the URI to look up.
    :return: A list of ``Lemma`` objects, or None when the request was
        unsuccessful.

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> list(sorted(LWN.lemmas_from_uri('f1052')))
    [Lemma(lemma='frumentaria', pos='n', morpho='n-s---fn1-', uri='f1052'), Lemma(lemma='frumentarius', pos='n', morpho='n-s---mn2-', uri='f1052'), Lemma(lemma='frumentarius', pos='a', morpho='aps---mn1-', uri='f1052')]
    """
    # The response is also kept on ``self.json``.
    response = self.json = requests.get(
        f"{self.host()}/api/uri/{uri}?format=json", timeout=(30.0, 90.0)
    )
    if not response:
        return None
    found = []
    for entry in response.json()["results"]:
        item = Lemma(self, **entry)
        self._lemma_cache[entry["lemma"]][entry["pos"]][entry["morpho"]][
            entry["uri"]
        ] = item
        found.append(item)
    return found
def synsets(self, pos=None):
    """Load all synsets, optionally restricted to one part of speech.

    The first page is requested from the synsets endpoint; while a page
    carries a truthy ``next`` link, that link is followed and its results
    are appended.

    :param pos: Optional part-of-speech tag; ``*`` is sent when omitted
    :return: A generator of Synset objects (empty if the first request was
        unsuccessful)

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> len(list(LWN.synsets('r'))) > 3000
    True
    """
    records = []
    response = requests.get(
        f"{self.host()}/api/synsets/{pos or '*'}/?format=json",
        timeout=(30.0, 90.0),
    )
    if response:
        page = response.json()
        records += page["results"]
        # Follow the pagination links until the last page.
        while page["next"]:
            page = requests.get(page["next"], timeout=(30.0, 90.0)).json()
            records += page["results"]

    return (
        Synset(
            self,
            record["language"],
            record["pos"],
            record["offset"],
            record["gloss"],
        )
        for record in records
    )
def semfields(self, code=None):
    """Load semfields, either all of them or those matching a code.

    Without a code, every page of the semfields endpoint is fetched by
    following the ``next`` links. With a code, a single request is made and
    its results are used only if the request succeeded.

    :param code: Optional semfield code
    :return: A list of Semfield objects sorted by (code, english)

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> list(LWN.semfields('300'))
    [Semfield(code='300', english='Social Sciences'), Semfield(code='300', english='Social Sciences, Sociology & Anthropology'), Semfield(code='300', english='Social sciences')]
    """
    records = []
    if code is not None:
        response = requests.get(
            f"{self.host()}/api/semfields/{code}/?format=json", timeout=(30.0, 90.0)
        )
        if response:
            records += response.json()["results"]
    else:  # pragma: no cover
        page = requests.get(
            f"{self.host()}/api/semfields/?format=json", timeout=(30.0, 90.0)
        ).json()
        records += page["results"]
        while page["next"]:
            page = requests.get(page["next"], timeout=(30.0, 90.0)).json()
            records += page["results"]

    found = [Semfield(self, record["code"], record["english"]) for record in records]
    found.sort(key=lambda semfield: (semfield.code(), semfield.english()))
    return found
#############################################################
# Lemmatizer
#############################################################
def lemmatize(self, form: str, morpho: str = None):
    """Lemmatizes a word form.

    :param form: The form to lemmatize, as a string
    :param morpho: Optional 10-place morphological descriptor, used as a filter
    :return: A generator of matching Lemma objects, or an empty list when
        the form is empty after translation through ``punctuation``, the
        request is unsuccessful, or the service returns no analyses
    :raises ValueError: If the reader's ISO code is 'skt' or 'grk'

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> print(list(LWN.lemmatize('pumice')))
    [Lemma(lemma='pumex', pos='n', morpho='n-s---cn3-', uri='p4512')]
    """
    if self._iso_code in ("skt", "grk"):
        raise ValueError(
            f"Lemmatization not currently available for '{self._iso_code}'"
        )

    form = form.translate(punctuation)
    if not form:
        return []

    response = requests.get(
        f"{self.host()}/lemmatize/{form}/{morpho if morpho else ''}?format=json",
        timeout=(30.0, 90.0),
    )
    if not response:
        return []

    # Decode the body once and reuse it for both the emptiness check and
    # the iteration below.
    analyses = response.json()
    if not analyses:
        return []

    return (
        Lemma(
            self,
            analysis["lemma"]["lemma"],
            # The first character of the morphological descriptor is
            # passed as the part of speech.
            analysis["lemma"]["morpho"][0],
            analysis["lemma"]["morpho"],
            analysis["lemma"]["uri"],
        )
        for analysis in analyses
    )
#############################################################
# Translator
#############################################################
def translate(self, language: str, form: str, pos: str = "*"):
    """Translates an English, French, Spanish, or Italian word into Latin.

    :param language: 'en', 'fr', 'es', 'it' indicating the source language
    :param form: The word to translate
    :param pos: Optionally, a part-of-speech ('n', 'v', 'a', 'r') indicator
        used as a filter; a falsy value omits the segment from the URL
    :return: A generator of Lemma objects, or None if the request was
        unsuccessful

    >>> LWN = WordNetCorpusReader(iso_code="lat")
    >>> offspring_translations = list(LWN.translate('en', 'offspring'))
    >>> print('pusio' in [lemma.lemma() for lemma in offspring_translations])
    True
    """
    pos_segment = f"{pos}/" if pos else ""
    response = requests.get(
        f"{self.host()}/translate/{language}/{form}/{pos_segment}?format=json",
        timeout=(30.0, 90.0),
    )
    if not response:
        return None

    entries = response.json()["results"]
    return (
        Lemma(self, entry["lemma"], entry["pos"], entry["morpho"], entry["uri"])
        for entry in entries
    )
######################################################################
# WordNet Information Content Corpus Reader
######################################################################
class WordNetICCorpusReader(CorpusReader):
    """A corpus reader for the WordNet information content corpus.

    :param iso_code: Language code; used to build the default root directory
        when ``root`` is not given
    :param root: The root directory where the information content file is stored.
    :param fileids: A list of file names, relative to the root directory, in this
        case a single file containing information content for a corpus.

    >>> from cltk.wordnet.wordnet import WordNetICCorpusReader
    >>> LWNIC = WordNetICCorpusReader(iso_code='lat', fileids=['ic-lasla.dat'])
    """

    def __init__(self, iso_code, root=None, fileids=None):
        if not root:
            root = os.path.join(
                get_cltk_data_dir(),
                f"{iso_code}/model/{iso_code}_models_cltk/semantics/wordnet/",
            )
        CorpusReader.__init__(self, root, fileids, encoding="utf8")
        # Load the first file when one is given; an empty ``fileids`` is
        # treated like None instead of raising IndexError on ``fileids[0]``.
        if fileids:
            self.load_ic(fileids[0])
        else:
            self._ic = None

    def ic(self):  # pragma: no cover
        """Return the loaded information content dictionary (or None)."""
        return self._ic

    #############################################################
    # Create information content from corpus
    #############################################################
    def create_ic(
        self, iso_code, corpus, weight_senses_equally=False, smoothing=1.0
    ):  # pragma: no cover
        """Creates an information content lookup dictionary from a corpus.

        The dictionary is stored on the reader (retrievable through ``ic()``);
        nothing is returned.

        :param iso_code: Language code passed to ``WordNetCorpusReader``
        :type corpus: CorpusReader
        :param corpus: The corpus from which we create an information
            content dictionary.
        :type weight_senses_equally: bool
        :param weight_senses_equally: If this is True, gives all
            possible senses equal weight rather than dividing by the
            number of possible senses. (If a word has 3 synsets, each
            sense gets 0.3333 per appearance when this is False, 1.0 when
            it is true.)
        :param smoothing: How much do we smooth synset counts (default is 1.0)
        :type smoothing: float
        """
        WN = WordNetCorpusReader(iso_code=iso_code)

        # Count how often each lemma is produced by lemmatizing the corpus.
        counts = FreqDist()
        for ww in corpus.words():
            for lemma in WN.lemmatize(ww):
                counts[lemma] += 1

        ic = {pp: defaultdict(float) for pp in POS_LIST}

        # Initialize the counts with the smoothing value
        if smoothing > 0.0:
            for ss in WN.synsets():
                ic[ss._pos][ss._offset] = smoothing

        for ww in counts:
            possible_synsets = list(ww.synsets())
            if not possible_synsets:
                continue

            # Distribute weight among possible synsets
            weight = float(counts[ww])
            if not weight_senses_equally:
                weight /= float(len(possible_synsets))

            for ss in possible_synsets:
                pos = ss._pos
                for level in ss._iter_hypernym_lists():
                    for hh in level:
                        ic[pos][hh._offset] += weight
                # Add the weight to the root
                ic[pos][0] += weight

        self._ic = ic

    def write_ic(self, corpus_name):  # pragma: no cover
        """Write the current information content to ``ic-<corpus_name>.dat``.

        The file is created under the reader's root directory, starts with a
        ``lwnver:`` header line, and holds one ``<synset id> <value>`` line
        per entry, suffixed with ``ROOT`` for synsets without hypernyms.

        :param corpus_name: Name used to build the output file name
        :raises WordNetError: If no information content is available
        """
        if self._ic is None:
            raise WordNetError("No information content available")

        # NOTE(review): neither ``synset_from_pos_and_offset`` nor
        # ``get_status`` is defined on this class in the visible source;
        # unless the base class supplies them this raises AttributeError.
        # Confirm, and if so delegate to a WordNetCorpusReader instance.
        get_synset = self.synset_from_pos_and_offset
        file_name = "ic-{}.dat".format(corpus_name)
        path = os.path.join(self._root, file_name)

        with codecs.open(path, "w", "utf8") as fp:
            last_modified = self.get_status()["last_modified"]
            fp.write("lwnver:{}\n".format(last_modified))
            for pp in POS_LIST:
                for offset, value in self._ic[pp].items():
                    # Key 0 holds the accumulated root count, not a synset;
                    # ``load_ic`` rebuilds it from the ROOT-marked lines.
                    if offset == 0:
                        continue
                    ss = get_synset(pp, offset)
                    if len(ss.hypernyms()) == 0:
                        fp.write("{} {} ROOT\n".format(ss.id(), value))
                    else:
                        fp.write("{} {}\n".format(ss.id(), value))

        self._fileids = [file_name]

    def load_ic(self, icfile=None):  # pragma: no cover
        """Load an information content file into the reader.

        The result is a dictionary whose keys are POS types and whose values
        are dictionaries that map from synset offsets to information content
        values; key ``0`` of each inner dictionary accumulates the values of
        ROOT-marked lines. It is stored on the reader (retrievable through
        ``ic()``); nothing is returned.

        :type icfile: str
        :param icfile: The name of the wordnet_ic file (e.g. "ic-latin_library.dat");
            defaults to the reader's first file id
        :raises WordNetError: If no file is given and the reader has no file ids

        >>> from cltk.wordnet.wordnet import WordNetICCorpusReader
        >>> LWNIC = WordNetICCorpusReader(iso_code="lat")
        >>> LWNIC.load_ic('ic-lasla.dat')
        """
        if not icfile:
            if self._fileids:
                icfile = self._fileids[0]
            else:
                raise WordNetError("No information content file specified")

        ic = {pos: defaultdict(float) for pos in POS_LIST}

        stream = self.open(icfile)
        try:
            for num, line in enumerate(stream):
                if num == 0:  # skip the header
                    continue
                fields = line.split()
                if not fields:  # tolerate blank lines (e.g. a trailing newline)
                    continue
                pos, offset = fields[0].split("#")
                value = float(fields[1])
                if len(fields) == 3 and fields[2] == "ROOT":
                    # Store root count.
                    ic[pos][0] += value
                if value != 0:
                    ic[pos][offset] = value
        finally:
            # Release the file handle even if a line fails to parse.
            stream.close()

        self._fileids = [icfile]
        self._ic = ic

    def information_content(self, synset):  # pragma: no cover
        """Retrieve the information content score for a synset.

        The score is ``-log(count / root_count)`` for the synset's part of
        speech, or ``_INF`` when the synset has a zero count.

        :param synset: The synset to score
        :return: The information content value
        :raises WordNetError: If no information content has been loaded, or
            it has no entries for the synset's part of speech

        >>> from cltk.wordnet.wordnet import WordNetCorpusReader, WordNetICCorpusReader
        >>> LWN = WordNetCorpusReader(iso_code="lat")
        >>> LWNIC = WordNetICCorpusReader(iso_code="lat", fileids=['ic-lasla.dat'])
        >>> s = LWN.synset_from_pos_and_offset('n', '02542418')
        >>> LWNIC.information_content(s)
        9.256474058450094
        """
        if not self._ic:
            raise WordNetError("No information content file has been loaded")

        try:
            icpos = self._ic[synset._pos]
        except KeyError as err:
            msg = "Information content file has no entries for part-of-speech: %s"
            raise WordNetError(msg % synset._pos) from err

        counts = icpos[synset._offset]
        if counts == 0:
            return _INF
        return -math.log(counts / icpos[0])
# Lookup table from relation symbols to human-readable relation names.
relation_types = {
    "!": "antonyms",
    "@": "hypernyms",
    "~": "hyponyms",
    "#m": "member-of",
    "#s": "substance-of",
    "#p": "part-of",
    "%m": "has-member",
    "%s": "has-substance",
    "%p": "has-part",
    "=": "attribute-of",
    "|": "nearest",
    "+r": "has-role",
    "-r": "is-role-of",
    "*": "entails",
    ">": "causes",
    "^": "also-see",
    "$": "verb-group",
    "&": "similar-to",
    "<": "participle",
    "+c": "composed-of",
    "-c": "composes",
    "\\": "derived-from",
    "/": "related-to",
}
# Example usage
if __name__ == "__main__":
LWN = WordNetCorpusReader(iso_code="lat")
lemmas = list(LWN.lemmatize("virtutem"))
print("Lemmatized 'virtutem':", lemmas)
virtus = LWN.lemma_from_uri("u0800")
print("Fetched lemma by URI:", virtus)
print("...with synsets:")
for synset in virtus.synsets():
print("-", synset.gloss())
animus = LWN.lemma("animus", return_ambiguous=False)
print("Fetched lemma", animus)
print("'Virtus' and 'animus' share the following synsets:")
for synset in set(virtus.synsets()).intersection(set(animus.synsets())):
print("-", synset.id(), "in semfields:", list(synset.semfields()))
print(
"...with synonyms:", ", ".join([lemma.lemma() for lemma in synset.lemmas()])
)
print(
"...and antonyms:",
", ".join(
[
lemma.lemma()
for antonym in synset.antonyms()
for lemma in antonym.lemmas()
]
),
)
courage = list(LWN.translate("en", "courage", "n"))
print("Translating 'courage':", courage)
s1 = LWN.synset("n#02542418")
print("Fetched synset:", s1.id(), "=", s1.gloss())
s2 = LWN.synset("n#03457380")
print("Fetched synset:", s2.id(), "=", s2.gloss())
print("Common hypernyms:")
for hypernym in sorted(s1.common_hypernyms(s2), key=lambda x: x.offset()):
print("-", hypernym.gloss())