From 310d92b696a3fef0509b776eaff009b3040960a1 Mon Sep 17 00:00:00 2001 From: burgersmoke Date: Thu, 6 Aug 2020 23:29:44 -0600 Subject: [PATCH 1/3] Implementation to allow QuickUMLS to be used as a component within a spacy pipeline. This can be used as an entity matcher for UMLS concepts in other modular spacy pipelines. This is already being used in an operational capacity for syndromic surveillance. --- quickumls/core.py | 32 +++++++------ quickumls/spacy_component.py | 90 ++++++++++++++++++++++++++++++++++++ spacy_example_pipeline.py | 29 ++++++++++++ 3 files changed, 138 insertions(+), 13 deletions(-) create mode 100644 quickumls/spacy_component.py create mode 100644 spacy_example_pipeline.py diff --git a/quickumls/core.py b/quickumls/core.py index fa9955b..e84bdf6 100644 --- a/quickumls/core.py +++ b/quickumls/core.py @@ -26,7 +26,8 @@ def __init__( overlapping_criteria='score', threshold=0.7, window=5, similarity_name='jaccard', min_match_length=3, accepted_semtypes=constants.ACCEPTED_SEMTYPES, - verbose=False, keep_uppercase=False): + verbose=False, keep_uppercase=False, + spacy_component = False): """Instantiate QuickUMLS object This is the main interface through which text can be processed. @@ -146,18 +147,23 @@ def __init__( self.accepted_semtypes = accepted_semtypes - try: - self.nlp = spacy.load(spacy_lang) - except OSError: - msg = ( - 'Model for language "{}" is not downloaded. Please ' - 'run "python -m spacy download {}" before launching ' - 'QuickUMLS' - ).format( - self.language_flag, - constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx') - ) - raise OSError(msg) + # if this is not being executed as as spacy component, then it must be standalone + if spacy_component: + # In this case, the pipeline is external to this current class + self.nlp = None + else: + try: + self.nlp = spacy.load(spacy_lang) + except OSError: + msg = ( + 'Model for language "{}" is not downloaded. 
Please ' + 'run "python -m spacy download {}" before launching ' + 'QuickUMLS' + ).format( + self.language_flag, + constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx') + ) + raise OSError(msg) self.ss_db = toolbox.SimstringDBReader( simstring_fp, similarity_name, threshold diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py new file mode 100644 index 0000000..336f796 --- /dev/null +++ b/quickumls/spacy_component.py @@ -0,0 +1,90 @@ +import spacy +from spacy.tokens import Span +from spacy.strings import StringStore + +from .core import QuickUMLS +from . import constants + +class SpacyQuickUMLS(object): + name = 'QuickUMLS matcher' + + def __init__(self, nlp, quickumls_path, + # these are all params that were from match() in quickumls but since we want to construct this + # and then make its behavior consistent as a component, we'll set them here: + best_match = True, ignore_syntax = False, verbose = False, + # these below are the same as in quickumls.py (so let's pass them through as our wrapper) + overlapping_criteria='score', threshold=0.7, window=5, + similarity_name='jaccard', min_match_length=3, + accepted_semtypes=constants.ACCEPTED_SEMTYPES): + + self.quickumls = QuickUMLS(quickumls_path, + overlapping_criteria=overlapping_criteria, threshold=threshold, window=window, + similarity_name=similarity_name, min_match_length=min_match_length, + accepted_semtypes=accepted_semtypes, + # By default, the QuickUMLS objects creates its own internal spacy pipeline but we do not need that in this case + spacy_component = True, + verbose = verbose) + + # save this off so that we can get vocab values of labels later + self.nlp = nlp + + self.best_match = best_match + self.ignore_syntax = ignore_syntax + self.verbose = verbose + + # let's extend this with some proprties that we want + Span.set_extension('similarity', default = -1.0) + Span.set_extension('semtypes', default = -1.0) + + if self.verbose: + print('Accepted semtypes : 
[{0}]'.format(accepted_semtypes)) + + def __call__(self, doc): + # much of this is a re-write of match() in quickumls.py + # however, the changes include: + # receiving an incoming doc (rather than parsing at calling time) + # transforming matches into Spans as per spaCy custom Entity code example + + ngrams = None + # pass in the incoming doc which has already been tokenized (ready for ngrams) + if self.ignore_syntax: + ngrams = self.quickumls._make_token_sequences(doc) + else: + ngrams = self.quickumls._make_ngrams(doc) + + # perform the matching + matches = self.quickumls._get_all_matches(ngrams) + + if self.verbose: + print('Total matches before best match: [{0}]'.format(len(matches))) + + if self.best_match: + matches = self.quickumls._select_terms(matches) + + if self.verbose: + print('Total matches after best match: [{0}]'.format(len(matches))) + + self.quickumls._print_verbose_status(doc, matches) + + # Here's another change: convert match objects into Spans + for match in matches: + # each match may match multiple ngrams + for ngram_match_dict in match: + start_char_idx = int(ngram_match_dict['start']) + end_char_idx = int(ngram_match_dict['end']) + + cui = ngram_match_dict['cui'] + # add the string + self.nlp.vocab.strings.add(cui) + # pull out the value + cui_label_value = self.nlp.vocab.strings[cui] + + # char_span() created a Span from the character indices + # UMLS CUI should work well as the label here + span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value) + # add some custom metadata + span._.similarity = ngram_match_dict['similarity'] + span._.semtypes = ngram_match_dict['semtypes'] + doc.ents = list(doc.ents) + [span] + + return doc \ No newline at end of file diff --git a/spacy_example_pipeline.py b/spacy_example_pipeline.py new file mode 100644 index 0000000..4d4ced1 --- /dev/null +++ b/spacy_example_pipeline.py @@ -0,0 +1,29 @@ +import spacy + +import quickumls +from quickumls.spacy_component import SpacyQuickUMLS + +# 
setup a spacy pipeline which includes QuickUMLS as a component and nothing else +nlp = spacy.load('en_core_web_sm') + +nlp.remove_pipe('tagger') +nlp.remove_pipe('parser') + +# TODO -- change this from a hard coded path to a small chunk of UMLS that can be legally distributed +quickumls_path = r'C:\temp_quickumls\SNOMED_RXNORM_CPT_lowercase' + +threshold = 0.8 + +quickumls_component = SpacyQuickUMLS(nlp, quickumls_path, threshold = threshold) + +nlp.add_pipe(quickumls_component) + +doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrrhea') + +for ent in doc.ents: + print('Entity text : {}'.format(ent.text)) + print('Label : {}'.format(ent.label_)) + print('Similarity : {}'.format(ent._.similarity)) + print('Semtypes : {}'.format(ent._.semtypes)) + +print('DONE with spacy/QuickUMLS demo') \ No newline at end of file From b06b8820c27bfdd1ef98c37e887de60f1a63e9fc Mon Sep 17 00:00:00 2001 From: burgersmoke Date: Mon, 10 Aug 2020 22:17:54 -0600 Subject: [PATCH 2/3] Factoring QuickUMLS class slightly to minimize duplicated code between QuickUMLS and SpacyQuickUMLS. Fixed keyword arguments to QuickUMLS when creating a spacy component. Added documentation as well. Removing previous standalone example Python file and instead adding an example of a QuickUMLS spacy pipeline to the README. 
--- README.md | 23 ++++++++++++ quickumls/core.py | 30 +++++++++++++-- quickumls/spacy_component.py | 71 ++++++++++++------------------------ spacy_example_pipeline.py | 29 --------------- 4 files changed, 73 insertions(+), 80 deletions(-) delete mode 100644 spacy_example_pipeline.py diff --git a/README.md b/README.md index 3e42585..8ed022e 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,29 @@ Set `best_match` to `False` if you want to return overlapping candidates, `ignor If the matcher throws a warning during initialization, read [this page](https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4) to learn why and how to stop it from doing so. +## spaCy pipeline component + +QuickUMLS can be used for standalone processing but it can also be use as a component in a modular spaCy pipeline. This follows traditional spaCy handling of concepts to be entity objects added to the Document object. These entity objects contain the CUI, similarity score and Semantic Types in the spacy "underscore" object. 
+ +Adding QuickUMLS as a component in a pipeline can be done as follows: + +```python +from quickumls.spacy_component import SpacyQuickUMLS + +# common English pipeline +nlp = spacy.load('en_core_web_sm') + +quickumls_component = SpacyQuickUMLS(nlp, 'PATH_TO_QUICKUMLS_DATA') +nlp.add_pipe(quickumls_component) + +doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrrhea') + +for ent in doc.ents: + print('Entity text : {}'.format(ent.text)) + print('Label (UMLS CUI) : {}'.format(ent.label_)) + print('Similarity : {}'.format(ent._.similarity)) + print('Semtypes : {}'.format(ent._.semtypes)) +``` ## Server / Client Support diff --git a/quickumls/core.py b/quickumls/core.py index e84bdf6..7164d18 100644 --- a/quickumls/core.py +++ b/quickumls/core.py @@ -443,17 +443,39 @@ def match(self, text, best_match=True, ignore_syntax=False): """ parsed = self.nlp(u'{}'.format(text)) + + # pass in parsed spacy doc to get concept matches + matches = self._match(parsed) + return matches + + def _match(self, doc, best_match=True, ignore_syntax=False): + """Gathers ngram matches given a spaCy document object. + + [extended_summary] + + Args: + text (Document): spaCy Document object to be used for extracting ngrams + + best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True. + ignore_syntax (bool, optional): Wether to use the heuristcs introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify,. 
Defaults to False + + Returns: + List: List of all matches in the text + TODO: Describe format + """ + + ngrams = None if ignore_syntax: - ngrams = self._make_token_sequences(parsed) + ngrams = self._make_token_sequences(doc) else: - ngrams = self._make_ngrams(parsed) + ngrams = self._make_ngrams(doc) matches = self._get_all_matches(ngrams) if best_match: matches = self._select_terms(matches) - self._print_verbose_status(parsed, matches) - + self._print_verbose_status(doc, matches) + return matches diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py index 336f796..f64982f 100644 --- a/quickumls/spacy_component.py +++ b/quickumls/spacy_component.py @@ -8,65 +8,42 @@ class SpacyQuickUMLS(object): name = 'QuickUMLS matcher' - def __init__(self, nlp, quickumls_path, - # these are all params that were from match() in quickumls but since we want to construct this - # and then make its behavior consistent as a component, we'll set them here: - best_match = True, ignore_syntax = False, verbose = False, - # these below are the same as in quickumls.py (so let's pass them through as our wrapper) - overlapping_criteria='score', threshold=0.7, window=5, - similarity_name='jaccard', min_match_length=3, - accepted_semtypes=constants.ACCEPTED_SEMTYPES): + def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs): + """Instantiate SpacyQuickUMLS object + + This creates a QuickUMLS spaCy component which can be used in modular pipelines. + This module adds entity Spans to the document where the entity label is the UMLS CUI and the Span's "underscore" object is extended to contains "similarity" and "semtypes" for matched concepts. + + Args: + nlp: Existing spaCy pipeline. This is needed to update the vocabulary with UMLS CUI values + quickumls_fp (str): Path to QuickUMLS data + best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True. 
+ ignore_syntax (bool, optional): Wether to use the heuristcs introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify,. Defaults to False + **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py) + """ - self.quickumls = QuickUMLS(quickumls_path, - overlapping_criteria=overlapping_criteria, threshold=threshold, window=window, - similarity_name=similarity_name, min_match_length=min_match_length, - accepted_semtypes=accepted_semtypes, - # By default, the QuickUMLS objects creates its own internal spacy pipeline but we do not need that in this case + self.quickumls = QuickUMLS(quickumls_fp, + # By default, the QuickUMLS objects creates its own internal spacy pipeline but this is not needed + # when we're using it as a component in a pipeline spacy_component = True, - verbose = verbose) + **kwargs) # save this off so that we can get vocab values of labels later self.nlp = nlp + # keep these for matching self.best_match = best_match self.ignore_syntax = ignore_syntax - self.verbose = verbose # let's extend this with some proprties that we want Span.set_extension('similarity', default = -1.0) Span.set_extension('semtypes', default = -1.0) - if self.verbose: - print('Accepted semtypes : [{0}]'.format(accepted_semtypes)) - def __call__(self, doc): - # much of this is a re-write of match() in quickumls.py - # however, the changes include: - # receiving an incoming doc (rather than parsing at calling time) - # transforming matches into Spans as per spaCy custom Entity code example - - ngrams = None - # pass in the incoming doc which has already been tokenized (ready for ngrams) - if self.ignore_syntax: - ngrams = self.quickumls._make_token_sequences(doc) - else: - ngrams = self.quickumls._make_ngrams(doc) - - # perform the matching - matches = self.quickumls._get_all_matches(ngrams) - - if self.verbose: - print('Total matches before best match: [{0}]'.format(len(matches))) - - if self.best_match: - matches = self.quickumls._select_terms(matches) - - if 
self.verbose: - print('Total matches after best match: [{0}]'.format(len(matches))) - - self.quickumls._print_verbose_status(doc, matches) + # pass in the document which has been parsed to this point in the pipeline for ngrams and matches + matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax) - # Here's another change: convert match objects into Spans + # Convert QuickUMLS match objects into Spans for match in matches: # each match may match multiple ngrams for ngram_match_dict in match: @@ -74,15 +51,15 @@ def __call__(self, doc): end_char_idx = int(ngram_match_dict['end']) cui = ngram_match_dict['cui'] - # add the string + # add the string to the spacy vocab self.nlp.vocab.strings.add(cui) # pull out the value cui_label_value = self.nlp.vocab.strings[cui] - # char_span() created a Span from the character indices + # char_span() creates a Span from these character indices # UMLS CUI should work well as the label here span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value) - # add some custom metadata + # add some custom metadata to the spans span._.similarity = ngram_match_dict['similarity'] span._.semtypes = ngram_match_dict['semtypes'] doc.ents = list(doc.ents) + [span] diff --git a/spacy_example_pipeline.py b/spacy_example_pipeline.py deleted file mode 100644 index 4d4ced1..0000000 --- a/spacy_example_pipeline.py +++ /dev/null @@ -1,29 +0,0 @@ -import spacy - -import quickumls -from quickumls.spacy_component import SpacyQuickUMLS - -# setup a spacy pipeline which includes QuickUMLS as a component and nothing else -nlp = spacy.load('en_core_web_sm') - -nlp.remove_pipe('tagger') -nlp.remove_pipe('parser') - -# TODO -- change this from a hard coded path to a small chunk of UMLS that can be legally distributed -quickumls_path = r'C:\temp_quickumls\SNOMED_RXNORM_CPT_lowercase' - -threshold = 0.8 - -quickumls_component = SpacyQuickUMLS(nlp, quickumls_path, threshold = threshold) - 
-nlp.add_pipe(quickumls_component) - -doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrrhea') - -for ent in doc.ents: - print('Entity text : {}'.format(ent.text)) - print('Label : {}'.format(ent.label_)) - print('Similarity : {}'.format(ent._.similarity)) - print('Semtypes : {}'.format(ent._.semtypes)) - -print('DONE with spacy/QuickUMLS demo') \ No newline at end of file From 0dda67b4cef810c694a8807bf4e93295157dd25e Mon Sep 17 00:00:00 2001 From: burgersmoke Date: Thu, 3 Sep 2020 16:13:32 -0600 Subject: [PATCH 3/3] Fixing issue #60 to ensure that SpacyQuickUMLS cannot add entity spans which overlap on a token. Also added some documentation to the class and README. --- README.md | 2 +- quickumls/spacy_component.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8ed022e..a0de114 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ If the matcher throws a warning during initialization, read [this page](https:// ## spaCy pipeline component -QuickUMLS can be used for standalone processing but it can also be use as a component in a modular spaCy pipeline. This follows traditional spaCy handling of concepts to be entity objects added to the Document object. These entity objects contain the CUI, similarity score and Semantic Types in the spacy "underscore" object. +QuickUMLS can be used for standalone processing but it can also be used as a component in a modular spaCy pipeline. This follows traditional spaCy handling of concepts to be entity objects added to the Document object. These entity objects contain the CUI, similarity score and Semantic Types in the spacy "underscore" object. Note that this implementation follows a [known spacy convention](https://github.com/explosion/spaCy/issues/3608) that entity Spans cannot overlap on a single token. 
To prevent token overlap, matches are ranked according to the supplied `overlapping_criteria`, and whenever two matches share a token the higher-ranked match takes priority. Adding QuickUMLS as a component in a pipeline can be done as follows: diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py index f64982f..036aef3 100644 --- a/quickumls/spacy_component.py +++ b/quickumls/spacy_component.py @@ -13,12 +13,13 @@ def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kw This creates a QuickUMLS spaCy component which can be used in modular pipelines. This module adds entity Spans to the document where the entity label is the UMLS CUI and the Span's "underscore" object is extended to contains "similarity" and "semtypes" for matched concepts. + Note that this implementation follows and enforces a known spacy convention that entity Spans cannot overlap on a single token. Args: nlp: Existing spaCy pipeline. This is needed to update the vocabulary with UMLS CUI values quickumls_fp (str): Path to QuickUMLS data best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True. - ignore_syntax (bool, optional): Wether to use the heuristcs introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify,. Defaults to False + ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify,. 
Defaults to False **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py) """ @@ -43,6 +44,15 @@ def __call__(self, doc): # pass in the document which has been parsed to this point in the pipeline for ngrams and matches matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax) + # NOTE: Spacy spans do not allow overlapping tokens, so we prevent the overlap here + # For more information, see: https://github.com/explosion/spaCy/issues/3608 + tokens_in_ents_set = set() + + # let's track any other entities which may have been attached via upstream components + for ent in doc.ents: + for token_index in range(ent.start, ent.end): + tokens_in_ents_set.add(token_index) + # Convert QuickUMLS match objects into Spans for match in matches: # each match may match multiple ngrams @@ -59,6 +69,17 @@ def __call__(self, doc): # char_span() creates a Span from these character indices # UMLS CUI should work well as the label here span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value) + + # before we add this, let's make sure that this entity does not overlap any tokens added thus far + candidate_token_indexes = set(range(span.start, span.end)) + + # check the intersection and skip this if there is any overlap + if len(tokens_in_ents_set.intersection(candidate_token_indexes)) > 0: + continue + + # track this to make sure we do not introduce overlap later + tokens_in_ents_set.update(candidate_token_indexes) + # add some custom metadata to the spans span._.similarity = ngram_match_dict['similarity'] span._.semtypes = ngram_match_dict['semtypes']