Changes from all commits (22 commits)
4f863b9  updated install data path (bradley-erickson, Oct 9, 2024)
9da9e4c  added find packages flag to setup (bradley-erickson, Oct 9, 2024)
5bbb434  added function to download so we can call it from code if needed (bradley-erickson, Oct 9, 2024)
1a419f4  Updated numpy dependency (duckduckdoof, Nov 7, 2024)
dd662ff  Merge branch 'main' of github.com:ArgLab/AWE_Components (duckduckdoof, Nov 7, 2024)
b80deba  Updated reference to lexica (duckduckdoof, Nov 7, 2024)
0d86877  Numpy version (#6) (duckduckdoof, Dec 16, 2024)
b483b2a  Merge branch 'main' of github.com:ArgLab/AWE_Components (duckduckdoof, Jan 9, 2025)
356d9a7  textblob features hotfix (duckduckdoof, Jan 28, 2025)
384cca0  Added spacytextblob doc feature name fix (duckduckdoof, Feb 2, 2025)
c8a874d  Added blob (duckduckdoof, Feb 2, 2025)
5487937  Added changes to config for testing (duckduckdoof, Apr 3, 2025)
709accb  Removed unnecessary dependencies, cleaned up existing (duckduckdoof, Apr 3, 2025)
a3b422b  Removed more unnecessary imports, fixed other import issues (duckduckdoof, Apr 3, 2025)
6422573  Removed even moar dependency issues, cleaned up imports (duckduckdoof, Apr 3, 2025)
a7b7189  Added back missing error imports, fixed variable decl (duckduckdoof, Apr 3, 2025)
f8c2260  Removed deprecated nameof (duckduckdoof, Apr 3, 2025)
d87fa76  Removed pesky import (duckduckdoof, Apr 3, 2025)
41758d3  Removed branch for lexica (duckduckdoof, Apr 17, 2025)
b0dd881  Updated readme on installation (duckduckdoof, Apr 24, 2025)
133150a  Merge pull request #13 from ArgLab/import_clean (duckduckdoof, May 1, 2025)
bb46ac4  Merge pull request #12 from ArgLab/textblobfix (duckduckdoof, May 1, 2025)
39 changes: 10 additions & 29 deletions README.md
@@ -6,46 +6,27 @@
which can be installed into a Spacy pipeline. They annotate the Spacy
parse tree with additional attributes that make it easy to summarize
information about features of student writing.

Before You Install
------------

AWE Components is best exercised through [AWE_Workbench](https://github.com/ArgLab/AWE_Workbench), which builds on the features defined in AWE Components. AWE Workbench provides a suite of automatic tests for verifying and validating AWE Components, along with examples, a web server for parsing documents, and an interactive highlighting tool for visualizing the document features derived from AWE Components.

Review AWE Workbench's installation steps first and decide whether you would rather use it than install AWE Components directly.

Installation
------------

Set up Python 3.9. 3.8 will *not* work. If you wish to use `conda`:
Set up Python 3.11. If you wish to use `conda`:

conda create -n test_install python=3.9 pip
conda create -n test_install python=3.11 pip
pip install pip --upgrade
conda activate test_install

If you wish to use plain old `pip` with `virtualenvwrapper`:

mkvirtualenv awe_components --python=/usr/bin/python3.9
mkvirtualenv awe_components --python=/usr/bin/python3.11
pip install pip --upgrade

Install prerequisites:

[Holmes Extractor Expandable](https://github.com/ETS-Next-Gen/holmes-extractor-expandable):

git clone git@github.com:ETS-Next-Gen/holmes-extractor-expandable.git
cd holmes-extractor-expandable/
pip install .

[AWE Language Tool](https://github.com/ETS-Next-Gen/AWE_LanguageTool):

git clone git@github.com:ETS-Next-Gen/AWE_LanguageTool.git
cd AWE_LanguageTool/
pip install .

[AWE Spell Correct](https://github.com/ETS-Next-Gen/AWE_SpellCorrect)

git clone git@github.com:ETS-Next-Gen/AWE_SpellCorrect.git
cd AWE_SpellCorrect/
pip install .

[AWE Lexica](https://github.com/ETS-Next-Gen/AWE_Lexica)

git clone git@github.com:ETS-Next-Gen/AWE_Lexica.git
cd AWE_Lexica/
pip install .

Then from the AWE Workbench Components directory:

pip install .
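
Taken together, a fresh install might look like the following (a sketch assuming the conda route above; the environment name and clone URL are illustrative, not prescriptive):

    conda create -n awe_components python=3.11 pip
    conda activate awe_components
    pip install pip --upgrade
    git clone git@github.com:ArgLab/AWE_Components.git
    cd AWE_Components/
    pip install .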
9 changes: 6 additions & 3 deletions awe_components/components/contentSegmentation.py
@@ -1,10 +1,13 @@
#!/usr/bin/env python3
# Copyright 2022, Educational Testing Service

from .utility_functions import *
from .utility_functions import \
match_related_form, getRoot, \
in_past_tense_scope, newSpanEntry, \
AWE_Info

from operator import itemgetter
import spacy
from spacy.tokens import Token, Doc
from spacy.tokens import Doc
from spacy.language import Language
import wordfreq

12 changes: 3 additions & 9 deletions awe_components/components/lexicalClusters.py
@@ -2,24 +2,18 @@
# Copyright 2022, Educational Testing Service

import re
import spacy
import srsly
import json
import wordfreq
import numpy as np
import os
from collections import OrderedDict

from scipy.spatial.distance import cosine
# Standard cosine distance metric

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

from spacy.tokens import Token, Doc
from spacy.language import Language

from .utility_functions import *
from ..errors import *
from .utility_functions import ResolveReference, all_zeros, AWE_Info

lang = "en"

@@ -480,7 +474,7 @@ def devword(token):
# flag assignClusterIDs to run
# by setting it to a non None value
token.doc._.clusterInfo_ = []
self.assignClusterIDs(token.doc)
assignClusterIDs(token.doc)
devlist = [token.text \
for token \
in developmentContentWords(token.doc)]
18 changes: 12 additions & 6 deletions awe_components/components/lexicalFeatures.py
@@ -19,18 +19,14 @@

import importlib.resources
import math
import numpy as np
import os
import re
from varname import nameof

# English dictionary. Contains information on senses associated with words
# (a lot more, but that's what we're currently using it for)
from nltk.corpus import wordnet
from scipy.spatial.distance import cosine # Standard cosine distance metric
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
from spacy.vocab import Vocab
from spacy.tokens import Doc, Token
import srsly
import statistics
# https://github.com/rspeer/wordfreq
@@ -41,7 +37,17 @@

import awe_lexica

from .utility_functions import * # <-- Paul, import only what you need here
from .utility_functions import \
setExtensionFunctions, alphanum_word, \
sylco, content_tags, \
ResolveReference, AWE_Info, \
possessive_or_determiner, personal_or_indefinite_pronoun, \
all_zeros, is_temporal, \
locative_adverbs, existential_there, \
major_locative_prepositions, all_locative_prepositions, \
loc_sverbs, loc_overbs, \
deictics

from ..errors import LexiconMissingError

def lexicon_path(lexicon):
42 changes: 20 additions & 22 deletions awe_components/components/syntaxDiscourseFeats.py
@@ -1,26 +1,25 @@
#!/usr/bin/env python3
# Copyright 2022, Educational Testing Service

import math
import os
import srsly
from varname import nameof

from enum import Enum
from spacy.tokens import Doc, Span, Token
from spacy.tokens import Doc, Token
from spacy.language import Language

from scipy.spatial.distance import cosine
# Standard cosine distance metric

from .utility_functions import *
from ..errors import *
from importlib import resources

from nltk.corpus import wordnet
# English dictionary. Contains information on senses associated with words
# (a lot more, but that's what we're currently using it for)
from .utility_functions import \
setExtensionFunctions, AWE_Info, \
in_past_tense_scope, getRoot, \
temporalPhrase, newSpanEntry, \
adj_noun_or_verb, content_tags, \
possessive_or_determiner, ResolveReference, \
tensed_clause

from importlib import resources
from ..errors import LexiconMissingError

@Language.factory("syntaxdiscoursefeatures")
def SyntaxAndDiscourseFeatures(nlp, name):
@@ -45,21 +44,20 @@ class SyntaxAndDiscourseFeatDef(object):
) as filepath:
TRANSITION_CATEGORIES_PATH = filepath

datapaths = [{'pathname': nameof(TRANSITION_TERMS_PATH),
'value': TRANSITION_TERMS_PATH},
{'pathname': nameof(TRANSITION_CATEGORIES_PATH),
'value': TRANSITION_CATEGORIES_PATH}]

transition_terms = {}
transition_categories = {}

def package_check(self, lang):
for path in self.datapaths:
if not os.path.exists(path['value']):
raise LexiconMissingError(
"Trying to load AWE Workbench Lexicon Module \
without {name} datafile".format(name=path['pathname'])
)
if not os.path.exists(self.TRANSITION_TERMS_PATH):
raise LexiconMissingError(
"Trying to load AWE Workbench Syntax and Discourse Feature \
Module without supporting datafile {}".format(self.TRANSITION_TERMS_PATH)
)
if not os.path.exists(self.TRANSITION_CATEGORIES_PATH):
raise LexiconMissingError(
"Trying to load AWE Workbench Syntax and Discourse Feature \
Module without supporting datafile {}".format(self.TRANSITION_CATEGORIES_PATH)
)

def load_lexicons(self, lang):
self.transition_terms = \
13 changes: 13 additions & 0 deletions awe_components/components/utility_functions.py
@@ -2899,6 +2899,9 @@ def setTokenEntry(name, token, value):
# attribute. #
# TBD: put security check in for this #
#######################################
elif "blob" in name:
name = name.replace("blob.", "")
entry['value'] = getattr(token._.blob, name)
elif token.has_extension(name):
# TODO: Use Token.get_extension
# https://spacy.io/api/token
@@ -3384,6 +3387,16 @@ def AWE_Info(document: Doc,
raise AWE_Workbench_Error(
'Invalid indicator ' + indicator)

# QUICK FIX: spacytextblob no longer exposes polarity, subjectivity,
# or assessments via doc._.X; they are now nested under doc._.blob.X.
# We patch the indicator names here in AWE_Info.
if indicator == "polarity":
indicator = "blob.polarity"
elif indicator == "subjectivity":
indicator = "blob.subjectivity"
elif indicator == "assessments":
indicator = "blob.assessments"

if infoType == 'Doc':
baseInfo = createSpanInfo(indicator,
document)
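
For context, the attribute move this quick fix works around can be seen in a minimal spacytextblob session (a sketch; it assumes spacy, spacytextblob, and the en_core_web_sm model are installed):

    import spacy
    from spacytextblob.spacytextblob import SpacyTextBlob  # registers the factory

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("spacytextblob")
    doc = nlp("I love this essay.")

    # Older spacytextblob versions exposed these directly (doc._.polarity, etc.);
    # current versions nest them under the doc._.blob TextBlob object:
    print(doc._.blob.polarity)
    print(doc._.blob.subjectivity)
    print(doc._.blob.sentiment_assessments.assessments)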
116 changes: 100 additions & 16 deletions awe_components/components/viewpointFeatures.py
@@ -3,10 +3,7 @@

import os
import srsly
import imp

from enum import Enum
from collections import OrderedDict
from spacy.tokens import Doc, Span, Token
from spacy.language import Language

@@ -16,8 +13,95 @@
from nltk.corpus import wordnet
# (a lot more, but that's what we're currently using it for)

from .utility_functions import *
from ..errors import *
from .utility_functions import \
AWE_Info, \
absolute_degree, \
adjectival_complement_dependencies, \
adjectival_mod_dependencies, \
adjectival_predicates, \
animate_ent_type, \
auxiliary_dependencies, \
auxiliary_or_adverb, \
be_verbs, \
clausal_complements, \
clausal_modifier_dependencies, \
clausal_subject_or_complement, \
common_evaluation_adjective, \
common_hedge_word, \
complements, \
containsDistinctReference, \
content_pos, \
contracted_verb, \
contraction, \
core_temporal_preps, \
coreViewpointPredicate, \
dative_preps, \
demonstratives, \
elliptical_verb, \
emphatic_adjective, \
emphatic_adverb, \
first_person_pronouns, \
function_word_tags, \
generalArgumentPredicate, \
general_complements_and_modifiers, \
generalViewpointPredicate, \
getDative, \
getLightVerbs, \
getLinkedNodes, \
getLogicalObject, \
getObject, \
getPrepObject, \
getRoot, \
getRoots, \
getSubject, \
getTensedVerbHead, \
illocutionary_tag, \
inanimate_3sg_pronouns, \
indefinite_comparison, \
indefinite_pronoun, \
in_modal_scope, \
in_past_tense_scope, \
is_definite_nominal, \
isRoot, \
loose_clausal_dependencies, \
newSpanEntry, \
newTokenEntry, \
nominal_pos, \
nonhuman_ent_type, \
object_predicate_dependencies, \
other_conversational_idioms, \
other_conversational_vocabulary, \
personal_or_indefinite_pronoun, \
pos_degree_mod, \
prehead_modifiers2, \
present_semimodals, \
private_mental_state_tag, \
quantifying_determiners, \
quotationMark, \
raising_complement, \
ResolveReference, \
rootTree, \
scanForAnimatePotentialAntecedents, \
second_person_pronouns, \
setExtensionFunctions, \
stance_adverb, \
stancePredicate, \
subject_dependencies, \
subject_or_object_nom, \
takesBareInfinitive, \
tensed_clause, \
third_person_pronouns, \
tough_complement, \
underlying_object_dependencies, \
verbal_mod_dependencies, \
verbal_pos, \
wh_question_word

from ..errors import LexiconMissingError
from importlib import resources


@@ -65,13 +149,13 @@ class ViewpointFeatureDef:
def package_check(self, lang):
if not os.path.exists(self.STANCE_PERSPECTIVE_PATH):
raise LexiconMissingError(
"Trying to load AWE Workbench Syntaxa and Discourse Feature \
Module without supporting datafile {}".format(filepath)
"Trying to load AWE Workbench Syntax and Discourse Feature \
Module without supporting datafile {}".format(self.STANCE_PERSPECTIVE_PATH)
)
if not os.path.exists(self.MORPHOLEX_PATH):
raise LexiconMissingError(
"Trying to load AWE Workbench Syntaxa and Discourse Feature \
Module without supporting datafile {}".format(filepath)
"Trying to load AWE Workbench Syntax and Discourse Feature \
Module without supporting datafile {}".format(self.MORPHOLEX_PATH)
)

def load_lexicon(self, lang):
@@ -4636,15 +4720,15 @@ def propagateNegation(self, doc: Doc):
# neutral.
if tok._.vwp_evaluation \
or tok._.vwp_hedge \
or tok.text in doc._.assessments:
if tok._.polarity < 0 or tok._.sentiword < 0:
tok._.vwp_tone_ = min(tok._.polarity, tok._.sentiword)
elif tok._.polarity > 0 and tok._.sentiword > 0:
tok._.vwp_tone_ = max(tok._.polarity, tok._.sentiword)
or tok.text in doc._.blob.sentiment_assessments.assessments:
if tok._.blob.sentiment_assessments.polarity < 0 or tok._.sentiword < 0:
tok._.vwp_tone_ = min(tok._.blob.sentiment_assessments.polarity, tok._.sentiword)
elif tok._.blob.sentiment_assessments.polarity > 0 and tok._.sentiword > 0:
tok._.vwp_tone_ = max(tok._.blob.sentiment_assessments.polarity, tok._.sentiword)
else:
tok._.vwp_tone_ = (tok._.polarity + tok._.sentiword) / 2
tok._.vwp_tone_ = (tok._.blob.sentiment_assessments.polarity + tok._.sentiword) / 2
else:
tok._.vwp_tone_ = min(tok._.polarity, tok._.sentiword)
tok._.vwp_tone_ = min(tok._.blob.sentiment_assessments.polarity, tok._.sentiword)

# rule order fixes to the tone variable are generally a bad idea,
# but these are so common that fixing them gets rid of a lot of
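
The tone rule in the hunk above can be read as a small standalone function (a sketch for illustration only; `polarity` stands in for `tok._.blob.sentiment_assessments.polarity` and `sentiword` for `tok._.sentiword`):

    def combined_tone(polarity, sentiword, evaluative_or_hedge_or_assessed):
        """Sketch of the vwp_tone_ assignment in propagateNegation."""
        if evaluative_or_hedge_or_assessed:
            if polarity < 0 or sentiword < 0:
                # Either signal negative: take the more negative value
                return min(polarity, sentiword)
            elif polarity > 0 and sentiword > 0:
                # Both positive: take the stronger positive value
                return max(polarity, sentiword)
            else:
                # Mixed or neutral signals: average them
                return (polarity + sentiword) / 2
        # Not evaluative, hedged, or assessed: default to the more negative value
        return min(polarity, sentiword)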