From 4eb21c8605dc6c7c52314ba83aa94003c30c12c6 Mon Sep 17 00:00:00 2001 From: "Hung Q. Vo" <66151263+Hung304-WBLEM@users.noreply.github.com> Date: Thu, 29 Sep 2022 05:35:45 +0700 Subject: [PATCH 1/2] Update spacy_component.py --- quickumls/spacy_component.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py index f64982f..8a79446 100644 --- a/quickumls/spacy_component.py +++ b/quickumls/spacy_component.py @@ -62,6 +62,16 @@ def __call__(self, doc): # add some custom metadata to the spans span._.similarity = ngram_match_dict['similarity'] span._.semtypes = ngram_match_dict['semtypes'] - doc.ents = list(doc.ents) + [span] - return doc \ No newline at end of file + # OLD: doc.ents = list(doc.ents) + [span] + # Using doc.spans["sc"] (SpanCategorizer) to solve the problem of overlapped tokens in nested NER for spacy. + # With doc.spans["sc"], all possible entities are stored without throwing errors. + doc.spans["sc"] = list(doc.spans["sc"]) + [span] + + # After storing all possible spans, we filter out overlapping spans before adding them to doc.ents. + # Here we remove overlapping spans using spacy.util.filter_spans + # When spans overlap, the rule is to prefer the first longest span over shorter ones. + for span in spacy.util.filter_spans(doc.spans["sc"]): + doc.ents = list(doc.ents) + [span] + + return doc From d3571694ccf2b5f678f9e55283f31ed0549241f6 Mon Sep 17 00:00:00 2001 From: "Hung Q. Vo" <66151263+Hung304-WBLEM@users.noreply.github.com> Date: Thu, 6 Oct 2022 06:50:41 +0700 Subject: [PATCH 2/2] Update spacy_component.py --- quickumls/spacy_component.py | 1 + 1 file changed, 1 insertion(+) diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py index 8a79446..6ecca24 100644 --- a/quickumls/spacy_component.py +++ b/quickumls/spacy_component.py @@ -44,6 +44,7 @@ def __call__(self, doc): matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax) # Convert QuickUMLS match objects into Spans + doc.spans['sc'] = [] for match in matches: # each match may match multiple ngrams for ngram_match_dict in match: