diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py index f64982f..6ecca24 100644 --- a/quickumls/spacy_component.py +++ b/quickumls/spacy_component.py @@ -44,6 +44,7 @@ def __call__(self, doc): matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax) # Convert QuickUMLS match objects into Spans + doc.spans['sc'] = [] for match in matches: # each match may match multiple ngrams for ngram_match_dict in match: @@ -62,6 +63,16 @@ def __call__(self, doc): # add some custom metadata to the spans span._.similarity = ngram_match_dict['similarity'] span._.semtypes = ngram_match_dict['semtypes'] - doc.ents = list(doc.ents) + [span] - return doc \ No newline at end of file + # OLD: doc.ents = list(doc.ents) + [span] + # Using doc.spans["sc"] (SpanCategorizer) to solve the problem of overlapping tokens in nested NER in spaCy. + # With doc.spans["sc"], all possible entities are stored without throwing errors. + doc.spans["sc"] = list(doc.spans["sc"]) + [span] + + # After storing all possible spans, we filter out overlapping spans before adding them to doc.ents. + # Here we remove overlapping spans using spacy.util.filter_spans. + # When spans overlap, the first longest span is preferred over shorter ones. + for span in spacy.util.filter_spans(doc.spans["sc"]): + doc.ents = list(doc.ents) + [span] + + return doc