"""``Process`` classes for accessing the Stanza project."""
from copy import deepcopy
from dataclasses import dataclass
from typing import Dict, List, Tuple
import stanza
from boltons.cacheutils import cachedproperty
from cltk.core.data_types import Doc, Process, Word
from cltk.dependency.stanza import StanzaWrapper
from cltk.dependency.tree import DependencyTree
from cltk.morphology.morphosyntax import (
MorphosyntacticFeatureBundle,
from_ud,
to_categorial,
)
@dataclass
class StanzaProcess(Process):
    """A ``Process`` type to capture everything
    that the ``stanza`` project can do for a
    given language.

    .. note::
        ``stanza`` has only partial functionality available for some languages.

    >>> from cltk.languages.example_texts import get_example_text
    >>> process_stanza = StanzaProcess(language="lat")
    >>> isinstance(process_stanza, StanzaProcess)
    True
    >>> from stanza.models.common.doc import Document
    >>> output_doc = process_stanza.run(Doc(raw=get_example_text("lat")))
    >>> isinstance(output_doc.stanza_doc, Document)
    True
    """

    # Language code handed to ``StanzaWrapper.get_nlp``; subclasses override it.
    language: str = None

    @cachedproperty
    def algorithm(self):
        """Return the ``StanzaWrapper`` obtained for ``self.language``.

        Wrapped in ``cachedproperty`` so the wrapper lookup is done on first
        access and the result is reused afterwards.
        """
        return StanzaWrapper.get_nlp(language=self.language)

    def run(self, input_doc: Doc) -> Doc:
        """Parse a document with Stanza and return an annotated copy.

        The input is deep-copied, so ``input_doc`` itself is not modified.
        ``normalized_text`` is parsed when it is truthy, otherwise ``raw``.

        :param input_doc: Document whose text should be parsed.
        :return: A copy of ``input_doc`` with ``words`` set to the converted
            CLTK ``Word`` objects and ``stanza_doc`` set to the parse result.
        """
        output_doc = deepcopy(input_doc)
        stanza_wrapper = self.algorithm

        # Prefer normalized text when a previous step has produced it.
        if output_doc.normalized_text:
            input_text = output_doc.normalized_text
        else:
            input_text = output_doc.raw

        stanza_doc = stanza_wrapper.parse(input_text)
        cltk_words = self.stanza_to_cltk_word_type(stanza_doc)
        output_doc.words = cltk_words
        output_doc.stanza_doc = stanza_doc
        return output_doc

    @staticmethod
    def stanza_to_cltk_word_type(stanza_doc):
        """Take an entire ``stanza`` document, extract
        each word, and encode it in the way expected by
        the CLTK's ``Word`` type.

        Words from all sentences are returned in one flat list; each word
        records its sentence via ``index_sentence``.

        :param stanza_doc: Parsed document exposing ``sentences``, each of
            which exposes ``tokens``.
        :return: List of CLTK ``Word`` objects, in document order.

        >>> from cltk.dependency.processes import StanzaProcess
        >>> from cltk.languages.example_texts import get_example_text
        >>> process_stanza = StanzaProcess(language="lat")
        >>> cltk_words = process_stanza.run(Doc(raw=get_example_text("lat"))).words
        >>> isinstance(cltk_words, list)
        True
        >>> isinstance(cltk_words[0], Word)
        True
        >>> cltk_words[0]
        Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', \
pos=noun, lemma='mallis', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', \
dependency_relation='nsubj', governor=3, \
features={Case: [nominative], Degree: [positive], Gender: [feminine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, \
embedding=None, stop=None, named_entity=None, syllables=None, phonetic_transcription=None)
        """
        words_list = []  # type: List[Word]

        for sentence_index, sentence in enumerate(stanza_doc.sentences):
            for token in sentence.tokens:
                # Only the first word of each token is converted.
                # NOTE(review): a token carrying several words would lose the
                # rest here -- confirm whether that can happen for the
                # supported languages.
                stanza_word = token.words[0]  # a stanza ``Word``

                # TODO: Figure out how to handle the token indexes, esp 0 (root) and None (?)
                cltk_word = Word(
                    # subtract 1 from id b/c Stanza starts their index at 1
                    index_token=int(stanza_word.id) - 1,
                    index_sentence=sentence_index,
                    string=stanza_word.text,  # same as ``token.text``
                    pos=from_ud("POS", stanza_word.pos),
                    xpos=stanza_word.xpos,
                    upos=stanza_word.upos,
                    lemma=stanza_word.lemma,
                    dependency_relation=stanza_word.deprel,
                    # note: if val becomes ``-1`` then no governor, ie word is
                    # root; ``fro`` gives None sometimes, what does this mean?
                    governor=stanza_word.head - 1 if stanza_word.head else -1,
                )  # type: Word

                # convert UD features ("Name=Value|Name=Value") to the
                # normalized CLTK features
                raw_features = (
                    [tuple(f.split("=")) for f in stanza_word.feats.split("|")]
                    if stanza_word.feats
                    else []
                )
                cltk_features = [
                    from_ud(feature_name, feature_value)
                    for feature_name, feature_value in raw_features
                ]
                cltk_word.features = MorphosyntacticFeatureBundle(*cltk_features)
                cltk_word.category = to_categorial(cltk_word.pos)
                # Keep the untouched Stanza feature string alongside the
                # normalized bundle.
                cltk_word.stanza_features = stanza_word.feats

                words_list.append(cltk_word)

            # TODO: ``Word.parent`` is not populated here; an earlier, disabled
            # per-sentence pass tried to derive it from governor/parent token
            # indices. Confirm whether it is needed, and for which models.

        return words_list
@dataclass
class GreekStanzaProcess(StanzaProcess):
    """``StanzaProcess`` preconfigured for Ancient Greek.

    Only the defaults differ from the parent class; parsing itself is
    inherited unchanged.
    """

    # Language code used when the Stanza wrapper is looked up.
    language: str = "grc"

    # Human-readable summary of this process.
    description: str = (
        "Default process for Stanza for the Ancient Greek language."
    )
@dataclass
class LatinStanzaProcess(StanzaProcess):
    """``StanzaProcess`` preconfigured for Latin.

    Only the defaults differ from the parent class; parsing itself is
    inherited unchanged.
    """

    # Language code used when the Stanza wrapper is looked up.
    language: str = "lat"

    # Human-readable summary of this process.
    description: str = (
        "Default process for Stanza for the Latin language."
    )
@dataclass
class OCSStanzaProcess(StanzaProcess):
    """``StanzaProcess`` preconfigured for Old Church Slavonic.

    Only the defaults differ from the parent class; parsing itself is
    inherited unchanged.
    """

    # Language code used when the Stanza wrapper is looked up.
    language: str = "chu"

    # Human-readable summary of this process.
    description: str = "Default process for Stanza for the Old Church Slavonic language."
@dataclass
class OldFrenchStanzaProcess(StanzaProcess):
    """``StanzaProcess`` preconfigured for Old French.

    Only the defaults differ from the parent class; parsing itself is
    inherited unchanged.
    """

    # Language code used when the Stanza wrapper is looked up.
    language: str = "fro"

    # Human-readable summary of this process.
    description: str = (
        "Default process for Stanza for the Old French language."
    )
@dataclass
class GothicStanzaProcess(StanzaProcess):
    """``StanzaProcess`` preconfigured for Gothic.

    Only the defaults differ from the parent class; parsing itself is
    inherited unchanged.
    """

    # Language code used when the Stanza wrapper is looked up.
    language: str = "got"

    # Human-readable summary of this process.
    description: str = (
        "Default process for Stanza for the Gothic language."
    )
@dataclass
class CopticStanzaProcess(StanzaProcess):
    """``StanzaProcess`` preconfigured for Coptic.

    Only the defaults differ from the parent class; parsing itself is
    inherited unchanged.
    """

    # Language code used when the Stanza wrapper is looked up.
    language: str = "cop"

    # Human-readable summary of this process.
    description: str = (
        "Default process for Stanza for the Coptic language."
    )
@dataclass
class ChineseStanzaProcess(StanzaProcess):
    """``StanzaProcess`` preconfigured for Classical Chinese.

    Only the defaults differ from the parent class; parsing itself is
    inherited unchanged.
    """

    # Language code used when the Stanza wrapper is looked up.
    language: str = "lzh"

    # Human-readable summary of this process.
    description: str = (
        "Default process for Stanza for the Classical Chinese language."
    )
class TreeBuilderProcess(Process):
    """A ``Process`` that takes a doc containing sentences of CLTK words
    and returns a dependency tree for each sentence.

    TODO: JS help to make this work, illustrate better.

    >>> from cltk import NLP
    >>> nlp = NLP(language="got")
    >>> from cltk.dependency.processes import TreeBuilderProcess
    >>> nlp.pipeline.add_process(TreeBuilderProcess) # doctest: +SKIP
    >>> from cltk.languages.example_texts import get_example_text # doctest: +SKIP
    >>> doc = nlp.analyze(text=get_example_text("got")) # doctest: +SKIP
    >>> len(doc.trees) # doctest: +SKIP
    4
    """

    # NOTE(review): ``algorithm`` is a plain method taking the doc here, and no
    # ``run`` is defined in this class -- confirm this matches what the
    # ``Process`` base class expects.
    def algorithm(self, doc):
        """Attach one dependency tree per sentence to ``doc``.

        :param doc: Document exposing ``sentences``; it is modified in place.
        :return: The same ``doc`` object, with ``trees`` set to a list holding
            the ``DependencyTree.to_tree`` result for each sentence, in order.
        """
        built_trees = []
        for sent in doc.sentences:
            built_trees.append(DependencyTree.to_tree(sent))
        doc.trees = built_trees
        return doc