diff --git a/README.md b/README.md index 5511a71..f601dcb 100755 --- a/README.md +++ b/README.md @@ -6,46 +6,27 @@ which can be installed into a Spacy pipeline. They annotate the Spacy parse tree with additional attributes that make it easy to summarize information about features of student writing. +Before You Install +------------ + +Note that AWE Components is best tested using [AWE_Workbench](https://github.com/ArgLab/AWE_Workbench), which utilizes the features defined in AWE Components. There are a series of automatic tests which can be run to verify or validate AWE Components; in addition, there are examples, a web server for parsing documents, and an interactive document highlighting tool for visualizing the document features which are derived from AWE Components. + +See AWE Workbench's installation steps and decide whether you want to use it instead of installing AWE Components directly. + Installation ------------ -Set up Python 3.9. 3.8 will *not* work. If you wish to use `conda`: +Set up Python 3.11. If you wish to use `conda`: - conda create -n test_install python=3.9 pip + conda create -n test_install python=3.11 pip pip install pip --upgrade conda activate test_install If you wish to use plain old `pip` with `virtualenvwrapper`: - mkvirtualenv awe_components --python=/usr/bin/python3.9 + mkvirtualenv awe_components --python=/usr/bin/python3.11 pip install pip --upgrade -Install prerequisites: - -[Holmes Extractor Expandable](https://github.com/ETS-Next-Gen/holmes-extractor-expandable): - - git clone git@github.com:ETS-Next-Gen/holmes-extractor-expandable.git - cd holmes-extractor-expandable/~ - pip install . - -[AWE Language Tool](https://github.com/ETS-Next-Gen/AWE_LanguageTool): - - git clone git@github.com:ETS-Next-Gen/AWE_LanguageTool.git - cd AWE_LanguageTool/ - pip install . 
- -[AWE Spell Correct](https://github.com/ETS-Next-Gen/AWE_SpellCorrect) - - git clone git@github.com:ETS-Next-Gen/AWE_SpellCorrect.git - cd AWE_SpellCorrect/ - pip install . - -[AWE Lexica](https://github.com/ETS-Next-Gen/AWE_Lexica) - - git clone git@github.com:ETS-Next-Gen/AWE_Lexica.git - cd AWE_Lexica/ - pip install . - Then from the AWE Workbench Components directory: pip install . diff --git a/awe_components/components/contentSegmentation.py b/awe_components/components/contentSegmentation.py index 6895545..58903d9 100644 --- a/awe_components/components/contentSegmentation.py +++ b/awe_components/components/contentSegmentation.py @@ -1,10 +1,13 @@ #!/usr/bin/env python3 # Copyright 2022, Educational Testing Service -from .utility_functions import * +from .utility_functions import \ + match_related_form, getRoot, \ + in_past_tense_scope, newSpanEntry, \ + AWE_Info + from operator import itemgetter -import spacy -from spacy.tokens import Token, Doc +from spacy.tokens import Doc from spacy.language import Language import wordfreq diff --git a/awe_components/components/lexicalClusters.py b/awe_components/components/lexicalClusters.py index 42e898c..796ac19 100644 --- a/awe_components/components/lexicalClusters.py +++ b/awe_components/components/lexicalClusters.py @@ -2,24 +2,18 @@ # Copyright 2022, Educational Testing Service import re -import spacy -import srsly +import json import wordfreq import numpy as np -import os from collections import OrderedDict -from scipy.spatial.distance import cosine -# Standard cosine distance metric - from sklearn.preprocessing import StandardScaler from sklearn.cluster import AgglomerativeClustering from spacy.tokens import Token, Doc from spacy.language import Language -from .utility_functions import * -from ..errors import * +from .utility_functions import ResolveReference, all_zeros, AWE_Info lang = "en" @@ -480,7 +474,7 @@ def devword(token): # flag assignClusterIDs to run # by setting it to a non None value 
token.doc._.clusterInfo_ = [] - self.assignClusterIDs(token.doc) + assignClusterIDs(token.doc) devlist = [token.text \ for token \ in developmentContentWords(token.doc)] diff --git a/awe_components/components/lexicalFeatures.py b/awe_components/components/lexicalFeatures.py index 0de6b01..cc4e282 100644 --- a/awe_components/components/lexicalFeatures.py +++ b/awe_components/components/lexicalFeatures.py @@ -19,18 +19,14 @@ import importlib.resources import math -import numpy as np import os -import re -from varname import nameof # English dictionary. Contains information on senses associated with words # (a lot more, but that's what we're currently using it for) from nltk.corpus import wordnet from scipy.spatial.distance import cosine # Standard cosine distance metric from spacy.language import Language -from spacy.tokens import Doc, Span, Token -from spacy.vocab import Vocab +from spacy.tokens import Doc, Token import srsly import statistics # https://github.com/rspeer/wordfreq @@ -41,7 +37,17 @@ import awe_lexica -from .utility_functions import * # <-- Paul, import only what you need here +from .utility_functions import \ + setExtensionFunctions, alphanum_word, \ + sylco, content_tags, \ + ResolveReference, AWE_Info, \ + possessive_or_determiner, personal_or_indefinite_pronoun, \ + all_zeros, is_temporal, \ + locative_adverbs, existential_there, \ + major_locative_prepositions, all_locative_prepositions, \ + loc_sverbs, loc_overbs, \ + deictics + from ..errors import LexiconMissingError def lexicon_path(lexicon): diff --git a/awe_components/components/syntaxDiscourseFeats.py b/awe_components/components/syntaxDiscourseFeats.py index 7a0a63e..08d8acc 100644 --- a/awe_components/components/syntaxDiscourseFeats.py +++ b/awe_components/components/syntaxDiscourseFeats.py @@ -1,26 +1,25 @@ #!/usr/bin/env python3 # Copyright 2022, Educational Testing Service -import math import os import srsly -from varname import nameof -from enum import Enum -from spacy.tokens import 
Doc, Span, Token +from spacy.tokens import Doc, Token from spacy.language import Language from scipy.spatial.distance import cosine # Standard cosine distance metric -from .utility_functions import * -from ..errors import * -from importlib import resources - -from nltk.corpus import wordnet -# English dictionary. Contains information on senses associated with words -# (a lot more, but that's what we're currently using it for) +from .utility_functions import \ + setExtensionFunctions, AWE_Info, \ + in_past_tense_scope, getRoot, \ + temporalPhrase, newSpanEntry, \ + adj_noun_or_verb, content_tags, \ + possessive_or_determiner, ResolveReference, \ + tensed_clause +from importlib import resources +from ..errors import LexiconMissingError @Language.factory("syntaxdiscoursefeatures") def SyntaxAndDiscourseFeatures(nlp, name): @@ -45,21 +44,20 @@ class SyntaxAndDiscourseFeatDef(object): ) as filepath: TRANSITION_CATEGORIES_PATH = filepath - datapaths = [{'pathname': nameof(TRANSITION_TERMS_PATH), - 'value': TRANSITION_TERMS_PATH}, - {'pathname': nameof(TRANSITION_CATEGORIES_PATH), - 'value': TRANSITION_CATEGORIES_PATH}] - transition_terms = {} transition_categories = {} def package_check(self, lang): - for path in self.datapaths: - if not os.path.exists(path['value']): - raise LexiconMissingError( - "Trying to load AWE Workbench Lexicon Module \ - without {name} datafile".format(name=path['pathname']) - ) + if not os.path.exists(self.TRANSITION_TERMS_PATH): + raise LexiconMissingError( + "Trying to load AWE Workbench Syntax and Discourse Feature \ + Module without supporting datafile {}".format(self.TRANSITION_TERMS_PATH) + ) + if not os.path.exists(self.TRANSITION_CATEGORIES_PATH): + raise LexiconMissingError( + "Trying to load AWE Workbench Syntax and Discourse Feature \ + Module without supporting datafile {}".format(self.TRANSITION_CATEGORIES_PATH) + ) def load_lexicons(self, lang): self.transition_terms = \ diff --git a/awe_components/components/utility_functions.py 
b/awe_components/components/utility_functions.py index 67877df..4230716 100644 --- a/awe_components/components/utility_functions.py +++ b/awe_components/components/utility_functions.py @@ -2899,6 +2899,9 @@ def setTokenEntry(name, token, value): # attribute. # # TBD: put security check in for this # ####################################### + elif "blob" in name: + name = name.replace("blob.", "") + entry['value'] = getattr(token._.blob, name) elif token.has_extension(name): # TODO: Use Token.get_extension # https://spacy.io/api/token @@ -3384,6 +3387,16 @@ def AWE_Info(document: Doc, raise AWE_Workbench_Error( 'Invalid indicator ' + indicator) + # QUICK FIX: spacytextblob no longer references polarity, subjectivity, + # nor assessments via doc._.X, but rather doc._.blob.X + # We are quickly fixing this problem in AWE_Info + if indicator == "polarity": + indicator = "blob.polarity" + elif indicator == "subjectivity": + indicator = "blob.subjectivity" + elif indicator == "assessments": + indicator = "blob.assessments" + if infoType == 'Doc': baseInfo = createSpanInfo(indicator, document) diff --git a/awe_components/components/viewpointFeatures.py b/awe_components/components/viewpointFeatures.py index 92d10ef..5ee9d16 100644 --- a/awe_components/components/viewpointFeatures.py +++ b/awe_components/components/viewpointFeatures.py @@ -3,10 +3,7 @@ import os import srsly -import imp -from enum import Enum -from collections import OrderedDict from spacy.tokens import Doc, Span, Token from spacy.language import Language @@ -16,8 +13,95 @@ from nltk.corpus import wordnet # (a lot more, but that's what we're currently using it for) -from .utility_functions import * -from ..errors import * +from .utility_functions import \ + AWE_Info, \ + absolute_degree, \ + adjectival_complement_dependencies, \ + adjectival_mod_dependencies , \ + adjectival_predicates, \ + animate_ent_type , \ + auxiliary_dependencies, \ + auxiliary_or_adverb, \ + be_verbs , \ + clausal_complements , \ + 
clausal_modifier_dependencies , \ + clausal_subject_or_complement, \ + common_evaluation_adjective, \ + common_hedge_word, \ + complements , \ + containsDistinctReference, \ + content_pos , \ + contracted_verb, \ + contraction, \ + core_temporal_preps , \ + coreViewpointPredicate, \ + dative_preps , \ + demonstratives , \ + elliptical_verb, \ + emphatic_adjective, \ + emphatic_adjective, \ + emphatic_adverb, \ + first_person_pronouns , \ + function_word_tags , \ + generalArgumentPredicate, \ + general_complements_and_modifiers , \ + generalViewpointPredicate, \ + getDative, \ + getLightVerbs, \ + getLinkedNodes, \ + getLogicalObject, \ + getObject, \ + getPrepObject, \ + getRoot, \ + getRoots, \ + getSubject, \ + getSubject, \ + getTensedVerbHead, \ + illocutionary_tag, \ + inanimate_3sg_pronouns, \ + indefinite_comparison, \ + indefinite_pronoun , \ + in_modal_scope, \ + in_past_tense_scope, \ + is_definite_nominal, \ + isRoot, \ + loose_clausal_dependencies , \ + newSpanEntry, \ + newTokenEntry, \ + nominal_pos , \ + nonhuman_ent_type , \ + object_predicate_dependencies , \ + object_predicate_dependencies, \ + other_conversational_idioms, \ + other_conversational_vocabulary, \ + personal_or_indefinite_pronoun , \ + personal_or_indefinite_pronoun , \ + pos_degree_mod , \ + prehead_modifiers2 , \ + present_semimodals , \ + private_mental_state_tag, \ + quantifying_determiners, \ + quotationMark, \ + raising_complement, \ + ResolveReference, \ + rootTree, \ + scanForAnimatePotentialAntecedents, \ + second_person_pronouns , \ + setExtensionFunctions, \ + stance_adverb, \ + stancePredicate, \ + subject_dependencies , \ + subject_or_object_nom , \ + takesBareInfinitive, \ + tensed_clause, \ + third_person_pronouns , \ + tough_complement, \ + underlying_object_dependencies , \ + verbal_mod_dependencies , \ + verbal_pos , \ + wh_question_word + +from ..errors import LexiconMissingError from importlib import resources @@ -65,13 +149,13 @@ class ViewpointFeatureDef: def 
package_check(self, lang): if not os.path.exists(self.STANCE_PERSPECTIVE_PATH): raise LexiconMissingError( - "Trying to load AWE Workbench Syntaxa and Discourse Feature \ - Module without supporting datafile {}".format(filepath) + "Trying to load AWE Workbench Syntax and Discourse Feature \ + Module without supporting datafile {}".format(self.STANCE_PERSPECTIVE_PATH) ) if not os.path.exists(self.MORPHOLEX_PATH): raise LexiconMissingError( - "Trying to load AWE Workbench Syntaxa and Discourse Feature \ - Module without supporting datafile {}".format(filepath) + "Trying to load AWE Workbench Syntax and Discourse Feature \ + Module without supporting datafile {}".format(self.MORPHOLEX_PATH) ) def load_lexicon(self, lang): @@ -4636,15 +4720,15 @@ def propagateNegation(self, doc: Doc): # neutral. if tok._.vwp_evaluation \ or tok._.vwp_hedge \ - or tok.text in doc._.assessments: - if tok._.polarity < 0 or tok._.sentiword < 0: - tok._.vwp_tone_ = min(tok._.polarity, tok._.sentiword) - elif tok._.polarity > 0 and tok._.sentiword > 0: - tok._.vwp_tone_ = max(tok._.polarity, tok._.sentiword) + or tok.text in doc._.blob.sentiment_assessments.assessments: + if tok._.blob.sentiment_assessments.polarity < 0 or tok._.sentiword < 0: + tok._.vwp_tone_ = min(tok._.blob.sentiment_assessments.polarity, tok._.sentiword) + elif tok._.blob.sentiment_assessments.polarity > 0 and tok._.sentiword > 0: + tok._.vwp_tone_ = max(tok._.blob.sentiment_assessments.polarity, tok._.sentiword) else: - tok._.vwp_tone_ = (tok._.polarity + tok._.sentiword) / 2 + tok._.vwp_tone_ = (tok._.blob.sentiment_assessments.polarity + tok._.sentiword) / 2 else: - tok._.vwp_tone_ = min(tok._.polarity, tok._.sentiword) + tok._.vwp_tone_ = min(tok._.blob.sentiment_assessments.polarity, tok._.sentiword) # rule order fixes to the tone variable are generally a bad idea, # but these are so common that fixing them gets rid of a lot of diff --git a/awe_components/setup/data.py b/awe_components/setup/data.py index 
cd5018c..626a5ea 100644 --- a/awe_components/setup/data.py +++ b/awe_components/setup/data.py @@ -1,12 +1,20 @@ #!/usr/bin/env python3 # Copyright 2022, Educational Testing Service +# TODO: using numpy>1.26.4 seems to cause errors when installing from awe_workbench. +# This is related to the desired model (subwordbert) that we use for AWE. +# Version-locking seems to keep this from failing. from spacy.cli.download import download import os -if __name__ == '__main__': + +def download_models(): print('Downloading Spacy and Coreferee Lexicons') download('en_core_web_sm') download('en_core_web_lg') download('en_core_web_trf') os.system("python3 -m coreferee install en") + + +if __name__ == '__main__': + download_models() diff --git a/extensions.txt b/extensions.txt new file mode 100644 index 0000000..8dc3ff2 --- /dev/null +++ b/extensions.txt @@ -0,0 +1,61 @@ +AWE_Info +all_cluster_info +assessments +blob +clusterInfo +clusterInfo_ +concrete_details +corefChainInfo +coref_chains +direct_speech_spans +has_governing_subject +intersentence_cohesions +main_cluster_spans +main_cluster_spans_ +main_ideas +main_ideas_ +negation_tokens +nominalReferences +polarity +prompt +prompt_ +prompt_language +prompt_language_ +prompt_related +prompt_related_ +propositional_attitudes_ +sentenceThemes +sentence_types +sliding_window_cohesions +subjectivity +supporting_details +supporting_details_ +supporting_ideas +supporting_ideas_ +syntacticDepthsOfRhemes +syntacticDepthsOfThemes +syntacticProfile +syntacticProfileNormed +syntacticVariety +tense_changes +token_vectors +transition_distances +transition_word_profile +transition_word_profile_ +transitions +vwp_allocentric +vwp_argumentation +vwp_character_traits +vwp_direct_speech +vwp_egocentric +vwp_emotion_states +vwp_interactive +vwp_perspective_spans +vwp_perspective_spans_ +vwp_propositional_attitudes +vwp_quoted +vwp_social_awareness +vwp_stance_markers +vwp_stance_markers_ +vwp_statements_of_fact +vwp_statements_of_opinion diff 
--git a/install.py b/install.py index 930943c..88242a0 100644 --- a/install.py +++ b/install.py @@ -1,6 +1,4 @@ from setuptools.command.install import install as _install -from setuptools.command.develop import develop as _develop -from distutils import log import os import subprocess import sys @@ -16,4 +14,5 @@ class AWEInstall(_install): ''' def run(self): _install.run(self) - subprocess.run(['python', 'awe_components/setup/data.py'], env=modified_env) + script_path = os.path.join(os.path.dirname(__file__), 'awe_components', 'setup', 'data.py') + subprocess.run([sys.executable, script_path], env=modified_env) diff --git a/setup.cfg b/setup.cfg index c5d2fee..3bd4d44 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,18 +25,18 @@ classifiers = Topic :: Text Processing :: Automated Writing Evaluation [options] +packages = find: include_package_data = True python_requires = >=3.9 cmdclass = install = install.AWEInstall install_requires = - awe_lexica + awe_lexica @ git+https://github.com/ArgLab/AWE_Lexica.git spacy - holmes_extractor - coreferee + coreferee @ git+https://github.com/Arglab/coreferee.git@latest_spacy rdflib spacytextblob - numpy + numpy==1.26.4 srsly wordfreq statistics @@ -44,13 +44,13 @@ install_requires = scikit-learn nltk aenum - websockets + websockets<14 websocket-client pytest clint pygtrie transformers - torch + torch==2.4.1 Path [options.package_data] * = *.cfg, *.csv, *.json, *.txt