diff --git a/quickumls/__init__.py b/quickumls/__init__.py
index fcfd593..39056d1 100644
--- a/quickumls/__init__.py
+++ b/quickumls/__init__.py
@@ -1,3 +1,3 @@
-from .core import QuickUMLS
-from .client import get_quickumls_client
 from .about import *
+from .client import get_quickumls_client
+from .core import QuickUMLS
diff --git a/quickumls/about.py b/quickumls/about.py
index dc0ac78..51e2ae4 100644
--- a/quickumls/about.py
+++ b/quickumls/about.py
@@ -3,10 +3,10 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 # https://github.com/explosion/spaCy/blob/master/spacy/about.py
 
-__title__ = 'quickumls'
-__version__ = '1.4.0r1'
-__author__ = 'Luca Soldaini'
-__email__ = 'luca@ir.cs.georgetown.edu'
-__license__ = 'MIT'
+__title__ = "quickumls"
+__version__ = "1.4.0r1"
+__author__ = "Luca Soldaini"
+__email__ = "luca@ir.cs.georgetown.edu"
+__license__ = "MIT"
 __uri__ = "https://github.com/Georgetown-IR-Lab/QuickUMLS"
-__copyright__ = '2014-2020, Georgetown University Information Retrieval Lab'
+__copyright__ = "2014-2020, Georgetown University Information Retrieval Lab"
diff --git a/quickumls/client.py b/quickumls/client.py
index b40d645..b133f03 100644
--- a/quickumls/client.py
+++ b/quickumls/client.py
@@ -1,8 +1,8 @@
-from .network import MinimalClient
 from .core import QuickUMLS
+from .network import MinimalClient
 
 
-def get_quickumls_client(host='localhost', port=4645):
-    '''Return a client for a QuickUMLS server running on host at port'''
+def get_quickumls_client(host="localhost", port=4645):
+    """Return a client for a QuickUMLS server running on host at port"""
     client = MinimalClient(QuickUMLS, host=host, port=port, buffersize=4096)
     return client
diff --git a/quickumls/constants.py b/quickumls/constants.py
index 02e66ba..4202b12 100644
--- a/quickumls/constants.py
+++ b/quickumls/constants.py
@@ -1,14 +1,28 @@
 HEADERS_MRCONSO = [
-    'cui', 'lat', 'ts', 'lui', 'stt', 'sui', 'ispref', 'aui', 'saui',
-    'scui', 'sdui', 'sab', 'tty', 'code', 'str', 'srl', 'suppress', 'cvf'
-]
-HEADERS_MRSTY = [
-    'cui', 'sty', 'hier' 'desc', 'sid', 'num'
+    "cui",
+    "lat",
+    "ts",
+    "lui",
+    "stt",
+    "sui",
+    "ispref",
+    "aui",
+    "saui",
+    "scui",
+    "sdui",
+    "sab",
+    "tty",
+    "code",
+    "str",
+    "srl",
+    "suppress",
+    "cvf",
 ]
+HEADERS_MRSTY = ["cui", "sty", "hier", "desc", "sid", "num"]
 
-NEGATIONS = {'none', 'non', 'neither', 'nor', 'no', 'not'}
+NEGATIONS = {"none", "non", "neither", "nor", "no", "not"}
 
-# The following is a list of all existing semtypes along with their name and some examples. 
+# The following is a list of all existing semtypes along with their name and some examples.
 # You can easily select the ones you need by commenting out the lines that are not relevant for your application.
 ACCEPTED_SEMTYPES = {
@@ -21,7 +35,7 @@
     # 'T190', # Anatomical Abnormality, ex.: Bronchial Fistula; Foot Deformities; Hyperostosis of skull
     # 'T017', # Anatomical Structure, ex.: Cadaver; Pharyngostome; Anatomic structures
     # 'T008', # Animal, ex.: Animals; Animals, Laboratory; Carnivore
-    'T195', # Antibiotic, ex.: Antibiotics; bactericide; Thienamycins
+    "T195",  # Antibiotic, ex.: Antibiotics; bactericide; Thienamycins
     # 'T194', # Archaeon, ex.: Thermoproteales; Haloferax volcanii; Methanospirillum
     # 'T007', # Bacterium, ex.: Acetobacter; Bacillus cereus; Cytophaga
     # 'T053', # Behavior, ex.: Homing Behavior; Sexuality; Habitat Selection
@@ -30,10 +44,10 @@
     # 'T091', # Biomedical Occupation or Discipline, ex.: Adolescent Medicine; Cellular Neurobiology; Dentistry
     # 'T122', # Biomedical or Dental Material, ex.: Acrylic Resins; Bone Cements; Dentifrices
     # 'T012', # Bird, ex.: Serinus; Ducks; Quail
-    'T029', # Body Location or Region, ex.: Forehead; Sublingual Region; Base of skull structure
-    'T023', # Body Part, Organ, or Organ Component, ex.: Aorta; Brain Stem; Structure of neck of femur
+    "T029",  # Body Location or Region, ex.: Forehead; Sublingual Region; Base of skull structure
+    "T023",  # Body Part, Organ, or Organ Component, ex.: Aorta; Brain Stem; Structure of neck of femur
     # 'T030', # Body Space or Junction, ex.: Knee joint; Greater sac of peritoneum; Synapses
-    'T031', # Body Substance, ex.: Amniotic Fluid; saliva; Smegma
+    "T031",  # Body Substance, ex.: Amniotic Fluid; saliva; Smegma
     # 'T022', # Body System, ex.: Endocrine system; Renin-angiotensin system; Reticuloendothelial System
     # 'T088', # Carbohydrate Sequence, ex.: Carbohydrate Sequence; Abnormal carbohydrate sequence
     # 'T025', # Cell, ex.: B-Lymphocytes; Dendritic Cells; Fibroblasts
@@ -44,14 +58,14 @@
     # 'T120', # Chemical Viewed Functionally, ex.: Aerosol Propellants; Detergents; Stabilizing Agents
     # 'T104', # Chemical Viewed Structurally, ex.: Ammonium Compounds; Cations; Sulfur Compounds
     # 'T185', # Classification, ex.: Anatomy (MeSH Category); Tumor Stage Classification; axis i
-    'T201', # Clinical Attribute, ex.: Bone Density; heart rate; Range of Motion, Articular
-    'T200', # Clinical Drug, ex.: Ranitidine 300 MG Oral Tablet [Zantac]; Aspirin 300 MG Delayed Release Oral
+    "T201",  # Clinical Attribute, ex.: Bone Density; heart rate; Range of Motion, Articular
+    "T200",  # Clinical Drug, ex.: Ranitidine 300 MG Oral Tablet [Zantac]; Aspirin 300 MG Delayed Release Oral
     # 'T077', # Conceptual Entity, ex.: Geographic Factors; Fractals; Secularism
     # 'T019', # Congenital Abnormality, ex.: Albinism; Cleft palate with cleft lip; Polydactyly of toes
     # 'T056', # Daily or Recreational Activity, ex.: Badminton; Dancing; Swimming
-    'T060', # Diagnostic Procedure, ex.: Biopsy; Heart Auscultation; Magnetic Resonance Imaging
-    'T047', # Disease or Syndrome, ex.: Diabetes Mellitus; Drug Allergy; Malabsorption Syndrome
-    'T203', # Drug Delivery Device, ex.: Nordette 21 Day Pack; {7 (Terazosin 1 MG Oral Tablet) / 7 (Terazosin 2 MG
+    "T060",  # Diagnostic Procedure, ex.: Biopsy; Heart Auscultation; Magnetic Resonance Imaging
+    "T047",  # Disease or Syndrome, ex.: Diabetes Mellitus; Drug Allergy; Malabsorption Syndrome
+    "T203",  # Drug Delivery Device, ex.: Nordette 21 Day Pack; {7 (Terazosin 1 MG Oral Tablet) / 7 (Terazosin 2 MG
     # 'T065', # Educational Activity, ex.: Academic Training; Family Planning Training; Preceptorship
     # 'T196', # Element, Ion, or Isotope, ex.: Carbon; Chromium Isotopes; Radioisotopes
     # 'T018', # Embryonic Structure, ex.: Blastoderm; Fetus; Neural Crest
@@ -62,7 +76,7 @@
     # 'T051', # Event, ex.: Anniversaries; Exposure to Mumps virus (event); Device Unattended
     # 'T050', # Experimental Model of Disease, ex.: Alloxan Diabetes; Liver Cirrhosis, Experimental; Transient Gene Knock-Out
     # 'T099', # Family Group, ex.: Daughter; Is an only child; Unmarried Fathers
-    'T033', # Finding, ex.: Birth History; Downward displacement of diaphragm; Decreased glucose level
+    "T033",  # Finding, ex.: Birth History; Downward displacement of diaphragm; Decreased glucose level
     # 'T013', # Fish, ex.: Bass; Salmonidae; Whitefish
     # 'T168', # Food, ex.: Beverages; Egg Yolk (Dietary); Ice Cream
     # 'T021', # Fully Formed Anatomical Structure, ex.: Entire body as a whole; Female human body; Set of parts of human body
@@ -75,32 +89,32 @@
     # 'T096', # Group, ex.: Focus Groups; jury; teams
     # 'T102', # Group Attribute, ex.: Family Size; Group Structure; Life Expectancy
     # 'T131', # Hazardous or Poisonous Substance, ex.: Carcinogens; Fumigant; Mutagens
-    'T058', # Health Care Activity, ex.: ambulatory care services; Clinic Activities; Preventive Health Services
+    "T058",  # Health Care Activity, ex.: ambulatory care services; Clinic Activities; Preventive Health Services
     # 'T093', # Health Care Related Organization, ex.: Centers for Disease Control and Prevention (U.S.); Halfway Houses;
     # 'T125', # Hormone, ex.: Enteric Hormones; thymic humoral factor; Prohormone
     # 'T016', # Human, ex.: Homo sapiens; jean piaget; Member of public
     # 'T068', # Human-caused Phenomenon or Process, ex.: Baby Boom; Cultural Evolution; Mass Media
     # 'T078', # Idea or Concept, ex.: Capitalism; Civil Rights; Ethics
     # 'T129', # Immunologic Factor, ex.: Antigens; Immunologic Factors; Blood group antigen P
-    'T130', # Indicator, Reagent, or Diagnostic Aid, ex.: Fluorescent Dyes; Indicators and Reagents; India ink stain
+    "T130",  # Indicator, Reagent, or Diagnostic Aid, ex.: Fluorescent Dyes; Indicators and Reagents; India ink stain
     # 'T055', # Individual Behavior, ex.: Assertiveness; Grooming; Risk-Taking
-    'T037', # Injury or Poisoning, ex.: Accidental Falls; Carbon Monoxide Poisoning; Snake Bites
+    "T037",  # Injury or Poisoning, ex.: Accidental Falls; Carbon Monoxide Poisoning; Snake Bites
     # 'T197', # Inorganic Chemical, ex.: Carbonic Acid; aluminum nitride; ferric citrate
-    'T170', # Intellectual Product, ex.: Decision Support Techniques; Information Systems; Literature
-    'T034', # Laboratory or Test Result, ex.: Blood Flow Velocity; Serum Calcium Level; Spinal Fluid Pressure
-    'T059', # Laboratory Procedure, ex.: Blood Protein Electrophoresis; Crystallography; Radioimmunoassay
+    "T170",  # Intellectual Product, ex.: Decision Support Techniques; Information Systems; Literature
+    "T034",  # Laboratory or Test Result, ex.: Blood Flow Velocity; Serum Calcium Level; Spinal Fluid Pressure
+    "T059",  # Laboratory Procedure, ex.: Blood Protein Electrophoresis; Crystallography; Radioimmunoassay
     # 'T171', # Language, ex.: Armenian language; braille; Bilingualism
     # 'T066', # Machine Activity, ex.: Computer Simulation; Equipment Failure; Natural Language Processing
     # 'T015', # Mammal, ex.: Ursidae Family; Hamsters; Macaca
     # 'T073', # Manufactured Object, ex.: car seat; Cooking and Eating Utensils; Goggles
-    'T074', # Medical Device, ex.: Bone Screws; Headgear, Orthodontic; Compression Stockings
-    'T048', # Mental or Behavioral Dysfunction, ex.: Agoraphobia; Cyclothymic Disorder; Frigidity
-    'T041', # Mental Process, ex.: Anger; Auditory Fatigue; Avoidance Learning
+    "T074",  # Medical Device, ex.: Bone Screws; Headgear, Orthodontic; Compression Stockings
+    "T048",  # Mental or Behavioral Dysfunction, ex.: Agoraphobia; Cyclothymic Disorder; Frigidity
+    "T041",  # Mental Process, ex.: Anger; Auditory Fatigue; Avoidance Learning
     # 'T063', # Molecular Biology Research Technique, ex.: Northern Blotting; Genetic Engineering; In Situ Hybridization
     # 'T044', # Molecular Function, ex.: Binding, Competitive; Electron Transport; Glycolysis
     # 'T085', # Molecular Sequence, ex.: Genetic Code; Homologous Sequences; Molecular Sequence
     # 'T070', # Natural Phenomenon or Process, ex.: Air Movements; Corrosion; Lightning (phenomenon)
-    'T191', # Neoplastic Process, ex.: Abdominal Neoplasms; Bowen's Disease; Polyp in nasopharynx
+    "T191",  # Neoplastic Process, ex.: Abdominal Neoplasms; Bowen's Disease; Polyp in nasopharynx
     # 'T114', # Nucleic Acid, Nucleoside, or Nucleotide, ex.: Cytosine Nucleotides; Guanine; Oligonucleotides
     # 'T086', # Nucleotide Sequence, ex.: Base Sequence; Direct Repeat; RNA Sequence
     # 'T090', # Occupation or Discipline, ex.: Aviation; Craniology; Ecology
@@ -109,14 +123,14 @@
     # 'T109', # Organic Chemical, ex.: Benzene Derivatives
     # 'T001', # Organism, ex.: Organism; Infectious agent; Heterotroph
     # 'T032', # Organism Attribute, ex.: Age; Birth Weight; Eye Color
-    'T040', # Organism Function, ex.: Breeding; Hibernation; Motor Skills
+    "T040",  # Organism Function, ex.: Breeding; Hibernation; Motor Skills
     # 'T092', # Organization, ex.: Labor Unions; United Nations; Boarding school
-    'T046', # Pathologic Function, ex.: Inflammation; Shock; Thrombosis
+    "T046",  # Pathologic Function, ex.: Inflammation; Shock; Thrombosis
     # 'T101', # Patient or Disabled Group, ex.: Amputees; Institutionalized Child; Mentally Ill Persons
-    'T121', # Pharmacologic Substance, ex.: Antiemetics; Cardiovascular Agents; Alka-Seltzer
-    'T067', # Phenomenon or Process, ex.: Disasters; Motor Traffic Accidents; Depolymerization
+    "T121",  # Pharmacologic Substance, ex.: Antiemetics; Cardiovascular Agents; Alka-Seltzer
+    "T067",  # Phenomenon or Process, ex.: Disasters; Motor Traffic Accidents; Depolymerization
     # 'T072', # Physical Object, ex.: Printed Media; Meteors; Physical object
-    'T039', # Physiologic Function, ex.: Biorhythms; Hearing; Vasodilation
+    "T039",  # Physiologic Function, ex.: Biorhythms; Hearing; Vasodilation
     # 'T002', # Plant, ex.: Aloe; Pollen; Helianthus species
     # 'T098', # Population Group, ex.: Asian Americans; Ethnic group; Adult Offenders
     # 'T097', # Professional or Occupational Group, ex.: Clergy; Demographers; Hospital Volunteers
@@ -129,12 +143,12 @@
     # 'T062', # Research Activity, ex.: Animal Experimentation; Biomedical Research; Experimental Replication
     # 'T075', # Research Device, ex.: Electrodes, Enzyme; DNA Microarray Chip; Particle Count and Size Analyzer
     # 'T095', # Self-help or Relief Organization, ex.: Alcoholics Anonymous; Charities - organization; Red Cross
-    'T184', # Sign or Symptom, ex.: Dyspnea; Nausea; Pain
+    "T184",  # Sign or Symptom, ex.: Dyspnea; Nausea; Pain
     # 'T054', # Social Behavior, ex.: Acculturation; Communication; Interpersonal Relations
     # 'T082', # Spatial Concept, ex.: Mandibular Rest Position; Lateral; Extrinsic
     # 'T167', # Substance, ex.: Air (substance); Fossils; Plastics
     # 'T079', # Temporal Concept, ex.: Birth Intervals; Half-Life; Postoperative Period
-    'T061', # Therapeutic or Preventive Procedure, ex.: Cesarean section; Dermabrasion; Family psychotherapy
+    "T061",  # Therapeutic or Preventive Procedure, ex.: Cesarean section; Dermabrasion; Family psychotherapy
     # 'T024', # Tissue, ex.: Cartilage; Endothelium; Epidermis
     # 'T010', # Vertebrate, ex.: Vertebrates; Gnathostomata vertebrate; Craniata
     # 'T005', # Virus, ex.: Coliphages; Echovirus; Parvoviridae
@@ -142,47 +156,70 @@
 }
 
 UNICODE_DASHES = {
-    u'\u002d', u'\u007e', u'\u00ad', u'\u058a', u'\u05be', u'\u1400',
-    u'\u1806', u'\u2010', u'\u2011', u'\u2010', u'\u2012', u'\u2013',
-    u'\u2014', u'\u2015', u'\u2053', u'\u207b', u'\u2212', u'\u208b',
-    u'\u2212', u'\u2212', u'\u2e17', u'\u2e3a', u'\u2e3b', u'\u301c',
-    u'\u3030', u'\u30a0', u'\ufe31', u'\ufe32', u'\ufe58', u'\ufe63',
-    u'\uff0d'
+    u"\u002d",
+    u"\u007e",
+    u"\u00ad",
+    u"\u058a",
+    u"\u05be",
+    u"\u1400",
+    u"\u1806",
+    u"\u2010",
+    u"\u2011",
+    u"\u2010",
+    u"\u2012",
+    u"\u2013",
+    u"\u2014",
+    u"\u2015",
+    u"\u2053",
+    u"\u207b",
+    u"\u2212",
+    u"\u208b",
+    u"\u2212",
+    u"\u2212",
+    u"\u2e17",
+    u"\u2e3a",
+    u"\u2e3b",
+    u"\u301c",
+    u"\u3030",
+    u"\u30a0",
+    u"\ufe31",
+    u"\ufe32",
+    u"\ufe58",
+    u"\ufe63",
+    u"\uff0d",
 }
 
 # language with missing value
 # will not have support for tokenization
 LANGUAGES = {
-    'BAQ': None,  # Basque
-    'CHI': None,  # Chinese
-    'CZE': None,  # Czech
-    'DAN': 'danish',  # Danish
-    'DUT': 'dutch',  # Dutch
-    'ENG': 'english',  # English
-    'EST': None,  # Estonian
-    'FIN': 'finnish',  # Finnish
-    'FRE': 'french',  # French
-    'GER': 'german',  # German
-    'GRE': 'greek',  # Greek
-    'HEB': None,  # Hebrew
-    'HUN': 'hungarian',  # Hungarian
-    'ITA': 'italian',  # Italian
-    'JPN': None,  # Japanese
-    'KOR': None,  # Korean
-    'LAV': None,  # Latvian
-    'NOR': 'norwegian',  # Norwegian
-    'POL': 'polish',  # Polish
-    'POR': 'portoguese',  # Portuguese
-    'RUS': 'russian',  # Russian
-    'SCR': None,  # Croatian
-    'SPA': 'spanish',  # Spanish
-    'SWE': 'swedish',  # Swedish
-    'TUR': 'turkish',  # Turkish
+    "BAQ": None,  # Basque
+    "CHI": None,  # Chinese
+    "CZE": None,  # Czech
+    "DAN": "danish",  # Danish
+    "DUT": "dutch",  # Dutch
+    "ENG": "english",  # English
+    "EST": None,  # Estonian
+    "FIN": "finnish",  # Finnish
+    "FRE": "french",  # French
+    "GER": "german",  # German
+    "GRE": "greek",  # Greek
+    "HEB": None,  # Hebrew
+    "HUN": "hungarian",  # Hungarian
+    "ITA": "italian",  # Italian
+    "JPN": None,  # Japanese
+    "KOR": None,  # Korean
+    "LAV": None,  # Latvian
+    "NOR": "norwegian",  # Norwegian
+    "POL": "polish",  # Polish
+    "POR": "portuguese",  # Portuguese
+    "RUS": "russian",  # Russian
+    "SCR": None,  # Croatian
+    "SPA": "spanish",  # Spanish
+    "SWE": "swedish",  # Swedish
+    "TUR": "turkish",  # Turkish
 }
 
-DOMAIN_SPECIFIC_STOPWORDS = {
-    'time'
-}
+DOMAIN_SPECIFIC_STOPWORDS = {"time"}
 
 SPACY_LANGUAGE_MAP = {
     'ENG': 'en_core_web_sm',
diff --git a/quickumls/core.py b/quickumls/core.py
index 7164d18..8a35579 100644
--- a/quickumls/core.py
+++ b/quickumls/core.py
@@ -1,33 +1,40 @@
 # future statements for Python 2 compatibility
-from __future__ import (
-    unicode_literals, division, print_function, absolute_import)
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import datetime
 
 # built in modules
 import os
 import sys
-import datetime
-from six.moves import xrange
+
+import nltk
 
 # installed modules
 import spacy
-import nltk
+from six.moves import xrange
 from unidecode import unidecode
 
 # project modules
-from . import toolbox
-from . import constants
+from . import constants, toolbox
 
 
 class QuickUMLS(object):
     """The main class to interact with the matcher.
""" + def __init__( - self, quickumls_fp, - overlapping_criteria='score', threshold=0.7, window=5, - similarity_name='jaccard', min_match_length=3, - accepted_semtypes=constants.ACCEPTED_SEMTYPES, - verbose=False, keep_uppercase=False, - spacy_component = False): + self, + quickumls_fp, + overlapping_criteria="score", + threshold=0.7, + window=5, + similarity_name="jaccard", + min_match_length=3, + accepted_semtypes=constants.ACCEPTED_SEMTYPES, + verbose=False, + keep_uppercase=False, + spacy_component=False, + ): """Instantiate QuickUMLS object This is the main interface through which text can be processed. @@ -61,24 +68,23 @@ def __init__( self.verbose = verbose - valid_criteria = {'length', 'score'} + valid_criteria = {"length", "score"} err_msg = ( '"{}" is not a valid overlapping_criteria. Choose ' - 'between {}'.format( - overlapping_criteria, ', '.join(valid_criteria) - ) + "between {}".format(overlapping_criteria, ", ".join(valid_criteria)) ) assert overlapping_criteria in valid_criteria, err_msg self.overlapping_criteria = overlapping_criteria - valid_similarities = {'dice', 'jaccard', 'cosine', 'overlap'} - err_msg = ('"{}" is not a valid similarity name. Choose between ' - '{}'.format(similarity_name, ', '.join(valid_similarities))) - assert not(valid_similarities in valid_similarities), err_msg + valid_similarities = {"dice", "jaccard", "cosine", "overlap"} + err_msg = '"{}" is not a valid similarity name. Choose between ' "{}".format( + similarity_name, ", ".join(valid_similarities) + ) + assert not (valid_similarities in valid_similarities), err_msg self.similarity_name = similarity_name - simstring_fp = os.path.join(quickumls_fp, 'umls-simstring.db') - cuisem_fp = os.path.join(quickumls_fp, 'cui-semtypes.db') + simstring_fp = os.path.join(quickumls_fp, "umls-simstring.db") + cuisem_fp = os.path.join(quickumls_fp, "cui-semtypes.db") self.valid_punct = constants.UNICODE_DASHES self.negations = constants.NEGATIONS @@ -88,57 +94,61 @@ def __init__( self.threshold = threshold self.min_match_length = min_match_length self.to_lowercase_flag = os.path.exists( - os.path.join(quickumls_fp, 'lowercase.flag') + os.path.join(quickumls_fp, "lowercase.flag") ) self.normalize_unicode_flag = os.path.exists( - os.path.join(quickumls_fp, 'normalize-unicode.flag') + os.path.join(quickumls_fp, "normalize-unicode.flag") ) self.keep_uppercase = keep_uppercase # Check whether data is installed with lowercase flag and QuickUMLS initiated with keeping uppercase words if self.to_lowercase_flag and self.keep_uppercase: - raise ValueError('Database is installed with lowercase flag and QuickUMLS is initiated with ' - 'keep_uppercase flag. This would prevent identifying concepts that contain all uppercase' - 'characters. Please reinstall data without --lowercase or run QuickUMLS without' - '--keep_uppercase.') + raise ValueError( + "Database is installed with lowercase flag and QuickUMLS is initiated with " + "keep_uppercase flag. This would prevent identifying concepts that contain all uppercase" + "characters. Please reinstall data without --lowercase or run QuickUMLS without" + "--keep_uppercase." 
+            )
 
-        language_fp = os.path.join(quickumls_fp, 'language.flag')
+        language_fp = os.path.join(quickumls_fp, "language.flag")
 
         # download stopwords if necessary
         try:
             nltk.corpus.stopwords.words()
         except LookupError:
-            nltk.download('stopwords')
+            nltk.download("stopwords")
 
         if os.path.exists(language_fp):
             with open(language_fp) as f:
                 self.language_flag = f.read().strip()
         else:
-            self.language_flag = 'ENG'
+            self.language_flag = "ENG"
 
         if self.language_flag not in constants.LANGUAGES:
             raise ValueError('Language "{}" not supported'.format(self.language_flag))
         elif constants.LANGUAGES[self.language_flag] is None:
             self._stopwords = set()
-            spacy_lang = 'XXX'
+            spacy_lang = "XXX"
         else:
             self._stopwords = set(
                 nltk.corpus.stopwords.words(constants.LANGUAGES[self.language_flag])
             )
             spacy_lang = constants.SPACY_LANGUAGE_MAP[self.language_flag]
 
-        database_backend_fp = os.path.join(quickumls_fp, 'database_backend.flag')
+        database_backend_fp = os.path.join(quickumls_fp, "database_backend.flag")
 
         if os.path.exists(database_backend_fp):
            with open(database_backend_fp) as f:
                 self._database_backend = f.read().strip()
         else:
-            print('[WARNING] This installation was created with QuickUMLS v.1.3 or earlier, '
-                  'which does not support multiple database backends. For now, I\'ll '
-                  'assume that leveldb was used as default, implicit assumption will '
-                  'change in future versions of QuickUMLS. More info here: '
-                  'https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4',
-                  file=sys.stderr)
-            self._database_backend = 'leveldb'
+            print(
+                "[WARNING] This installation was created with QuickUMLS v.1.3 or earlier, "
+                "which does not support multiple database backends. For now, I'll "
+                "assume that leveldb was used as the default; this implicit assumption will "
+                "change in future versions of QuickUMLS. More info here: "
+                "https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4",
+                file=sys.stderr,
+            )
+            self._database_backend = "leveldb"
 
         # domain specific stopwords
         self._stopwords = self._stopwords.union(constants.DOMAIN_SPECIFIC_STOPWORDS)
@@ -158,19 +168,20 @@ def __init__(
             msg = (
                 'Model for language "{}" is not downloaded. Please '
                 'run "python -m spacy download {}" before launching '
-                'QuickUMLS'
+                "QuickUMLS"
             ).format(
                 self.language_flag,
-                constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx')
+                constants.SPACY_LANGUAGE_MAP.get(self.language_flag, "xx"),
             )
             raise OSError(msg)
 
-        self.ss_db = toolbox.SimstringDBReader(
-            simstring_fp, similarity_name, threshold
-        )
+        self.ss_db = toolbox.SimstringDBReader(simstring_fp, similarity_name, threshold)
         self.cuisem_db = toolbox.CuiSemTypesDB(
             cuisem_fp, database_backend=self._database_backend
         )
+        self.cuipref_db = toolbox.CuiPrefDB(
+            cuisem_fp, database_backend=self._database_backend
+        )
 
     def get_info(self):
         """Computes a summary of the matcher options.
@@ -194,44 +205,53 @@ def info(self):
 
         if self._info is None:
             self._info = {
-                'threshold': self.threshold,
-                'similarity_name': self.similarity_name,
-                'window': self.window,
-                'ngram_length': self.ngram_length,
-                'min_match_length': self.min_match_length,
-                'accepted_semtypes': sorted(self.accepted_semtypes),
-                'negations': sorted(self.negations),
-                'valid_punct': sorted(self.valid_punct)
+                "threshold": self.threshold,
+                "similarity_name": self.similarity_name,
+                "window": self.window,
+                "ngram_length": self.ngram_length,
+                "min_match_length": self.min_match_length,
+                "accepted_semtypes": sorted(self.accepted_semtypes),
+                "negations": sorted(self.negations),
+                "valid_punct": sorted(self.valid_punct),
             }
         return self._info
 
     def _is_valid_token(self, tok):
-        return not(
-            tok.is_punct or tok.is_space or
-            tok.pos_ == 'ADP' or tok.pos_ == 'DET' or tok.pos_ == 'CONJ'
+        return not (
+            tok.is_punct
+            or tok.is_space
+            or tok.pos_ == "ADP"
+            or tok.pos_ == "DET"
+            or tok.pos_ == "CONJ"
         )
 
     def _is_valid_start_token(self, tok):
-        return not(
-            tok.like_num or
-            (self._is_stop_term(tok) and tok.lemma_ not in self.negations) or
-            tok.pos_ == 'ADP' or tok.pos_ == 'DET' or tok.pos_ == 'CONJ'
+        return not (
+            tok.like_num
+            or (self._is_stop_term(tok) and tok.lemma_ not in self.negations)
+            or tok.pos_ == "ADP"
+            or tok.pos_ == "DET"
+            or tok.pos_ == "CONJ"
         )
 
     def _is_stop_term(self, tok):
         return tok.text in self._stopwords
 
     def _is_valid_end_token(self, tok):
-        return not(
-            tok.is_punct or tok.is_space or self._is_stop_term(tok) or
-            tok.pos_ == 'ADP' or tok.pos_ == 'DET' or tok.pos_ == 'CONJ'
+        return not (
+            tok.is_punct
+            or tok.is_space
+            or self._is_stop_term(tok)
+            or tok.pos_ == "ADP"
+            or tok.pos_ == "DET"
+            or tok.pos_ == "CONJ"
         )
 
     def _is_valid_middle_token(self, tok):
         return (
-            not(tok.is_punct or tok.is_space) or
-            tok.is_bracket or
-            tok.text in self.valid_punct
+            not (tok.is_punct or tok.is_space)
+            or tok.is_bracket
+            or tok.text in self.valid_punct
         )
 
     def _is_ok_semtype(self, target_semtypes):
@@ -248,7 +268,7 @@ def _make_ngrams(self, sent):
         sent_length = len(sent)
 
         # do not include determiners inside a span
-        skip_in_span = {token.i for token in sent if token.pos_ == 'DET'}
+        skip_in_span = {token.i for token in sent if token.pos_ == "DET"}
 
         # invalidate a span if it includes any of these symbols
         invalid_mid_tokens = {
@@ -273,11 +293,12 @@ def _make_ngrams(self, sent):
             # we take a shortcut if the token is the last one
             # in the sentence
             if (
-                i + 1 == sent_length and            # it's the last token
-                self._is_valid_end_token(tok) and   # it's a valid end token
-                len(tok) >= self.min_match_length   # it's of miminum length
+                i + 1 == sent_length  # it's the last token
+                and self._is_valid_end_token(tok)  # it's a valid end token
+                and len(tok) >= self.min_match_length  # it's of minimum length
             ):
-                yield(tok.idx, tok.idx + len(tok), tok.text)
+                yield (tok.idx, tok.idx + len(tok), tok.text)
 
             for j in xrange(i + 1, span_end):
                 if compensate:
@@ -296,11 +317,18 @@ def _make_ngrams(self, sent):
                     continue
 
                 yield (
-                    span.start_char, span.end_char,
-                    ''.join(token.text_with_ws for token in span
-                            if token.i not in skip_in_span).strip()
+                    span.start_char,
+                    span.end_char,
+                    "".join(
+                        token.text_with_ws
+                        for token in span
+                        if token.i not in skip_in_span
+                    ).strip(),
                 )
 
+    def get_preferred_term(self, cui):
+        return self.cuipref_db.get(cui)
+
     def _get_all_matches(self, ngrams):
         matches = []
         for start, end, ngram in ngrams:
@@ -317,11 +345,15 @@ def _get_all_matches(self, ngrams):
             # no match is found; so we convert to lowercase;
             # however, this is never needed if the string is lowercased
             # in the step above
-            if not self.to_lowercase_flag and ngram_normalized.isupper() and not self.keep_uppercase:
+            if (
+                not self.to_lowercase_flag
+                and ngram_normalized.isupper()
+                and not self.keep_uppercase
+            ):
                 ngram_normalized = ngram_normalized.lower()
 
             prev_cui = None
-            ngram_cands = list(self.ss_db.get(ngram_normalized))
+            ngram_cands = list(set(self.ss_db.get(ngram_normalized)))
 
             ngram_matches = []
 
@@ -332,11 +364,11 @@ def _get_all_matches(self, ngrams):
                     x=ngram_normalized,
                     y=match,
                     n=self.ngram_length,
-                    similarity_name=self.similarity_name
+                    similarity_name=self.similarity_name,
                 )
 
                 if match_similarity == 0:
-                    continue
+                    continue
 
                 for cui, semtypes, preferred in cuisem_match:
 
@@ -344,23 +376,25 @@ def _get_all_matches(self, ngrams):
                         continue
 
                     if prev_cui is not None and prev_cui == cui:
-                        if match_similarity > ngram_matches[-1]['similarity']:
+                        if match_similarity > ngram_matches[-1]["similarity"]:
                             ngram_matches.pop(-1)
                         else:
                             continue
 
                     prev_cui = cui
+                    preferred_term = self.cuipref_db.get(cui)
 
                     ngram_matches.append(
                         {
-                            'start': start,
-                            'end': end,
-                            'ngram': ngram,
-                            'term': toolbox.safe_unicode(match),
-                            'cui': cui,
-                            'similarity': match_similarity,
-                            'semtypes': semtypes,
-                            'preferred': preferred
+                            "start": start,
+                            "end": end,
+                            "ngram": ngram,
+                            "term": toolbox.safe_unicode(match),
+                            "cui": cui,
+                            "similarity": match_similarity,
+                            "semtypes": semtypes,
+                            "preferred": preferred,
+                            "preferred_term": preferred_term,
                         }
                     )
 
@@ -368,23 +402,24 @@ def _get_all_matches(self, ngrams):
             matches.append(
                 sorted(
                     ngram_matches,
-                    key=lambda m: m['similarity'] + m['preferred'],
-                    reverse=True
+                    key=lambda m: m["similarity"] + m["preferred"],
+                    reverse=True,
                 )
             )
         return matches
 
     @staticmethod
     def _select_score(match):
-        return (match[0]['similarity'], (match[0]['end'] - match[0]['start']))
+        return (match[0]["similarity"], (match[0]["end"] - match[0]["start"]))
 
     @staticmethod
     def _select_longest(match):
-        return ((match[0]['end'] - match[0]['start']), match[0]['similarity'])
+        return ((match[0]["end"] - match[0]["start"]), match[0]["similarity"])
 
     def _select_terms(self, matches):
         sort_func = (
-            self._select_longest if self.overlapping_criteria == 'length'
+            self._select_longest
+            if self.overlapping_criteria == "length"
             else self._select_score
         )
 
@@ -394,7 +429,7 @@ def _select_terms(self, matches):
         final_matches_subset = []
 
         for match in matches:
-            match_interval = (match[0]['start'], match[0]['end'])
+            match_interval = (match[0]["start"], match[0]["end"])
             if match_interval not in intervals:
                 final_matches_subset.append(match)
                 intervals.append(match_interval)
@@ -403,8 +438,7 @@ def _select_terms(self, matches):
 
     def _make_token_sequences(self, parsed):
         for i in range(len(parsed)):
-            for j in xrange(
-                    i + 1, min(i + self.window, len(parsed)) + 1):
+            for j in xrange(i + 1, min(i + self.window, len(parsed)) + 1):
                 span = parsed[i:j]
 
                 if not self._is_longer_than_min(span):
@@ -417,12 +451,12 @@ def _print_verbose_status(self, parsed, matches):
             return False
 
         print(
-            '[{}] {:,} extracted from {:,} tokens'.format(
+            "[{}] {:,} extracted from {:,} tokens".format(
                 datetime.datetime.now().isoformat(),
                 sum(len(match_group) for match_group in matches),
-                len(parsed)
+                len(parsed),
             ),
-            file=sys.stderr
+            file=sys.stderr,
         )
         return True
 
@@ -442,13 +476,13 @@ def match(self, text, best_match=True, ignore_syntax=False):
             TODO: Describe format
         """
 
-        parsed = self.nlp(u'{}'.format(text))
-
+        parsed = self.nlp("{}".format(text))
+
+        # pass in parsed spacy doc to get concept matches
-        matches = self._match(parsed)
+        matches = self._match(parsed, best_match, ignore_syntax)
 
         return matches
-
+
     def _match(self, doc, best_match=True, ignore_syntax=False):
         """Gathers ngram matches given a spaCy document object.
 
@@ -464,7 +498,7 @@ def _match(self, doc, best_match=True, ignore_syntax=False):
             List: List of all matches in the text
             TODO: Describe format
         """
-
+
         ngrams = None
         if ignore_syntax:
             ngrams = self._make_token_sequences(doc)
@@ -477,5 +511,5 @@ def _match(self, doc, best_match=True, ignore_syntax=False):
             matches = self._select_terms(matches)
 
         self._print_verbose_status(doc, matches)
-
+
         return matches
diff --git a/quickumls/install.py b/quickumls/install.py
index 0c1998c..f09c01d 100644
--- a/quickumls/install.py
+++ b/quickumls/install.py
@@ -1,85 +1,74 @@
-from __future__ import unicode_literals, division, print_function
+from __future__ import division, print_function, unicode_literals
 
 # built in modules
 import argparse
 import codecs
 import os
-from six.moves import input
 import shutil
 import sys
 import time
+
+import spacy
+import tqdm
+from six.moves import input
+
 try:
     from unidecode import unidecode
 except ImportError:
     pass
 
-
-# third party-dependencies
-import spacy
-
-
-# project modules
-from .toolbox import countlines, CuiSemTypesDB, SimstringDBWriter, mkdir
 from .constants import HEADERS_MRCONSO, HEADERS_MRSTY, LANGUAGES, SPACY_LANGUAGE_MAP
+from .toolbox import CuiPrefDB, CuiSemTypesDB, SimstringDBWriter, countlines, mkdir
 
 
 def get_semantic_types(path, headers):
     sem_types = {}
-    with codecs.open(path, encoding='utf-8') as f:
-        for i, ln in enumerate(f):
-            content = dict(zip(headers, ln.strip().split('|')))
-
-            sem_types.setdefault(content['cui'], []).append(content['sty'])
+    with codecs.open(path, encoding="utf-8") as f:
+        for line in tqdm.tqdm(f, total=countlines(path)):
+            content = dict(zip(headers, line.strip().split("|")))
+            sem_types.setdefault(content["cui"], []).append(content["sty"])
 
     return sem_types
 
 
-def get_mrconso_iterator(path, headers, lang='ENG'):
-    with codecs.open(path, encoding='utf-8') as f:
-        for i, ln in enumerate(f):
-            content = dict(zip(headers, ln.strip().split('|')))
+def get_mrconso_iterator(path, headers, lang="ENG"):
+    with codecs.open(path, encoding="utf-8") as f:
+        for ln in f:
+            content = dict(zip(headers, ln.strip().split("|")))
 
-            if content['lat'] != lang:
+            if content["lat"] != lang:
                 continue
 
             yield content
 
 
 def extract_from_mrconso(
-        mrconso_path, mrsty_path, opts,
-        mrconso_header=HEADERS_MRCONSO, mrsty_header=HEADERS_MRSTY):
+    mrconso_path,
+    mrsty_path,
+    opts,
+    mrconso_header=HEADERS_MRCONSO,
+    mrsty_header=HEADERS_MRSTY,
+):
 
     start = time.time()
 
-    print('loading semantic types...', end=' ')
+    print("loading semantic types...", end=" ")
     sys.stdout.flush()
     sem_types = get_semantic_types(mrsty_path, mrsty_header)
-    print('done in {:.2f} s'.format(time.time() - start))
+    print("done in {:.2f} s".format(time.time() - start))
 
     start = time.time()
 
-    mrconso_iterator = get_mrconso_iterator(
-        mrconso_path, mrconso_header, opts.language
-    )
+    mrconso_iterator = get_mrconso_iterator(mrconso_path, mrconso_header, opts.language)
 
     total = countlines(mrconso_path)
 
-    processed = set()
-    i = 0
-
-    for content in mrconso_iterator:
-        i += 1
-
-        if i % 100000 == 0:
-            delta = time.time() - start
-            status = (
-                '{:,} in {:.2f} s ({:.2%}, {:.1e} s / term)'
-                ''.format(i, delta, i / total, delta / i if i > 0 else 0)
-            )
-            print(status)
-
-        concept_text = content['str'].strip()
-        cui = content['cui']
-        preferred = 1 if content['ispref'] == 'Y' else 0
+    for content in tqdm.tqdm(mrconso_iterator, total=total):
+        concept_text = content["str"].strip()
+        cui = content["cui"]
+        preferred = 1 if content["ispref"] == "Y" else 0
+        preferred_term = 1 if content["ts"] == "P" else 0
+        preferred_string = 1 if content["stt"] == "PF" else 0
+        source = content["sab"]
 
         if opts.lowercase:
             concept_text = concept_text.lower()
@@ -87,37 +76,72 @@ def extract_from_mrconso(
         if opts.normalize_unicode:
             concept_text = unidecode(concept_text)
 
-        if (cui, concept_text) in processed:
-            continue
-        else:
-            processed.add((cui, concept_text))
+        yield (
+            concept_text,
+            cui,
+            sem_types[cui],
+            preferred,
+            preferred_term,
+            preferred_string,
+            source,
+        )
 
-        yield (concept_text, cui, sem_types[cui], preferred)
-
-    delta = time.time() - start
-    status = (
-        '\nCOMPLETED: {:,} in {:.2f} s ({:.1e} s / term)'
-        ''.format(i, delta, i / total, delta / i if i > 0 else 0)
-    )
-    print(status)
-
-
-def parse_and_encode_ngrams(extracted_it, simstring_dir, cuisty_dir, database_backend):
+
+def parse_and_encode_ngrams(
+    extracted_it, simstring_dir, cuisty_dir, database_backend, sources
+):
     # Create destination directories for the two databases
     mkdir(simstring_dir)
     mkdir(cuisty_dir)
 
     ss_db = SimstringDBWriter(simstring_dir)
     cuisty_db = CuiSemTypesDB(cuisty_dir, database_backend=database_backend)
-
-    simstring_terms = set()
-
-    for i, (term, cui, stys, preferred) in enumerate(extracted_it, start=1):
-        if term not in simstring_terms:
-            ss_db.insert(term)
-            simstring_terms.add(term)
-
-        cuisty_db.insert(term, cui, stys, preferred)
+    cuipref_db = CuiPrefDB(cuisty_dir, database_backend=database_backend)
+
+    # rows in MRCONSO are grouped by cui; buffer each cui's entries and only
+    # write them out once the whole group has been seen
+    prev_cui = None
+    pref_term = False
+    prev_term = None
+    cui_terms = set()
+    ss_db_terms = []
+    cuisty_terms = []
+    pref_terms = []
+    cui_sources = set()
+    for data in extracted_it:
+        term, cui, stys, preferred, preferred_term, preferred_string, source = data
+        if cui != prev_cui:
+            if prev_cui is not None:
+                if not pref_term:
+                    raise RuntimeError(
+                        f"did not find preferred term for cui {prev_cui}"
+                    )
+                if sources is None or cui_sources.intersection(sources):
+                    for _term in ss_db_terms:
+                        ss_db.insert(_term)
+                    for _data in cuisty_terms:
+                        cuisty_db.insert(*_data)
+                    for _data in pref_terms:
+                        cuipref_db.insert(*_data)
+
+            prev_cui = cui
+            pref_term = False
+            cui_terms = set()
+            ss_db_terms = []
+            cuisty_terms = []
+            pref_terms = []
+            cui_sources = set()
+
+        if prev_term != term and term not in cui_terms:
+            ss_db_terms.append(term)
+            prev_term = term
+        cui_terms.add(term)
+        cui_sources.add(source)
+
+        cuisty_terms.append((term, cui, stys, preferred))
+        if preferred_term and preferred and preferred_string:
+            pref_terms.append((term, cui))
+            pref_term = True
+
+    # flush the buffer for the final cui: the loop above only writes a group
+    # when it encounters the next cui, so the last group needs this pass
+    if prev_cui is not None:
+        if not pref_term:
+            raise RuntimeError(f"did not find preferred term for cui {prev_cui}")
+        if sources is None or cui_sources.intersection(sources):
+            for _term in ss_db_terms:
+                ss_db.insert(_term)
+            for _data in cuisty_terms:
+                cuisty_db.insert(*_data)
+            for _data in pref_terms:
+                cuipref_db.insert(*_data)
 
 
 def install_spacy(lang):
@@ -128,38 +152,57 @@ def install_spacy(lang):
     if lang in SPACY_LANGUAGE_MAP:
         try:
             spacy.load(SPACY_LANGUAGE_MAP[lang])
-            print(f'SpaCy is installed and avaliable for {lang}!')
+            print(f"SpaCy is installed and available for {lang}!")
         except OSError:
-            print(f'SpaCy is not available! Attempting to download and install...')
+            print("SpaCy is not available! Attempting to download and install...")
             spacy.cli.download(SPACY_LANGUAGE_MAP[lang])
 
 
 def parse_args():
     ap = argparse.ArgumentParser()
 
     ap.add_argument(
-        'umls_installation_path',
-        help=('Location of UMLS installation files (`MRCONSO.RRF` and '
-              '`MRSTY.RRF` files)')
+        "umls_installation_path",
+        help=(
+            "Location of UMLS installation files (`MRCONSO.RRF` and "
+            "`MRSTY.RRF` files)"
+        ),
+    )
+    ap.add_argument(
+        "destination_path",
+        help="Location where the necessary QuickUMLS files are installed",
     )
     ap.add_argument(
-        'destination_path',
-        help='Location where the necessary QuickUMLS files are installed'
+        "-L",
+        "--lowercase",
+        action="store_true",
+        help="Consider only lowercase version of tokens",
     )
     ap.add_argument(
-        '-L', '--lowercase', action='store_true',
-        help='Consider only lowercase version of tokens'
+        "-U",
+        "--normalize-unicode",
+        action="store_true",
+        help="Normalize unicode strings to their closest ASCII representation",
    )
     ap.add_argument(
-        '-U', '--normalize-unicode', action='store_true',
-        help='Normalize unicode strings to their closest ASCII representation'
+        "-d",
+        "--database-backend",
+        choices=("leveldb", "unqlite"),
+        default="unqlite",
+        help="KV database to use to store CUIs and semantic types",
     )
     ap.add_argument(
-        '-d', '--database-backend', choices=('leveldb', 'unqlite'), default='unqlite',
-        help='KV database to use to store CUIs and semantic types'
+        "-E",
+        "--language",
+        default="ENG",
+        choices=LANGUAGES,
+        help="Extract concepts of the specified language",
     )
     ap.add_argument(
-        '-E', '--language', default='ENG', choices=LANGUAGES,
-        help='Extract concepts of the specified language'
+        "-S",
+        "--sources",
+        default=None,
+        nargs="*",
+        help="Only include concepts that have at least one term from one of these sources",
     )
     opts = ap.parse_args()
     return opts
@@ -171,63 +214,72 @@ def main():
     install_spacy(opts.language)
 
     if not os.path.exists(opts.destination_path):
-        msg = ('Directory "{}" does not exists; should I create it? [y/N] '
-               ''.format(opts.destination_path))
-        create = input(msg).lower().strip() == 'y'
+        msg = 'Directory "{}" does not exist; should I create it? [y/N] '.format(
+            opts.destination_path
+        )
+        create = input(msg).lower().strip() == "y"
 
         if create:
             os.makedirs(opts.destination_path)
         else:
-            print('Aborting.')
+            print("Aborting.")
             exit(1)
 
     if len(os.listdir(opts.destination_path)) > 0:
-        msg = ('Directory "{}" is not empty; should I empty it? [y/N] '
-               ''.format(opts.destination_path))
-        empty = input(msg).lower().strip() == 'y'
+        msg = 'Directory "{}" is not empty; should I empty it? [y/N] '.format(
+            opts.destination_path
+        )
+        empty = input(msg).lower().strip() == "y"
 
         if empty:
             shutil.rmtree(opts.destination_path)
             os.mkdir(opts.destination_path)
         else:
-            print('Aborting.')
+            print("Aborting.")
             exit(1)
 
     if opts.normalize_unicode:
         try:
             unidecode
         except NameError:
-            err = ('`unidecode` is needed for unicode normalization'
-                   'please install it via the `[sudo] pip install '
-                   'unidecode` command.')
+            err = (
+                "`unidecode` is needed for unicode normalization; "
+                "please install it via the `[sudo] pip install "
+                "unidecode` command."
+            )
             print(err, file=sys.stderr)
             exit(1)
 
-        flag_fp = os.path.join(opts.destination_path, 'normalize-unicode.flag')
-        open(flag_fp, 'w').close()
+        flag_fp = os.path.join(opts.destination_path, "normalize-unicode.flag")
+        open(flag_fp, "w").close()
 
     if opts.lowercase:
-        flag_fp = os.path.join(opts.destination_path, 'lowercase.flag')
-        open(flag_fp, 'w').close()
+        flag_fp = os.path.join(opts.destination_path, "lowercase.flag")
+        open(flag_fp, "w").close()
 
-    flag_fp = os.path.join(opts.destination_path, 'language.flag')
-    with open(flag_fp, 'w') as f:
+    flag_fp = os.path.join(opts.destination_path, "language.flag")
+    with open(flag_fp, "w") as f:
         f.write(opts.language)
 
-    flag_fp = os.path.join(opts.destination_path, 'database_backend.flag')
-    with open(flag_fp, 'w') as f:
+    flag_fp = os.path.join(opts.destination_path, "database_backend.flag")
+    with open(flag_fp, "w") as f:
         f.write(opts.database_backend)
 
-    mrconso_path = os.path.join(opts.umls_installation_path, 'MRCONSO.RRF')
-    mrsty_path = os.path.join(opts.umls_installation_path, 'MRSTY.RRF')
+    mrconso_path = os.path.join(opts.umls_installation_path, "MRCONSO.RRF")
+    mrsty_path = os.path.join(opts.umls_installation_path, "MRSTY.RRF")
 
     mrconso_iterator = extract_from_mrconso(mrconso_path, mrsty_path, opts)
 
-    simstring_dir = os.path.join(opts.destination_path, 'umls-simstring.db')
-    cuisty_dir = os.path.join(opts.destination_path, 'cui-semtypes.db')
+    simstring_dir = os.path.join(opts.destination_path, "umls-simstring.db")
+    cuisty_dir = os.path.join(opts.destination_path, "cui-semtypes.db")
 
-    parse_and_encode_ngrams(mrconso_iterator, simstring_dir, cuisty_dir,
-                            database_backend=opts.database_backend)
+    parse_and_encode_ngrams(
+        mrconso_iterator,
+        simstring_dir,
+        cuisty_dir,
+        database_backend=opts.database_backend,
+        sources=opts.sources,
+    )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/quickumls/network.py b/quickumls/network.py
index 425289d..5af3504 100644
--- a/quickumls/network.py
+++ b/quickumls/network.py
@@ -1,27 +1,27 @@
-'''Minimal client server through sockets
-https://github.com/lucasoldaini/MinimalServer'''
+"""Minimal client server through sockets
+https://github.com/lucasoldaini/MinimalServer"""
 
-import six
-import sys
-import time
+import datetime
+import inspect
 import math
 import socket
-import inspect
-import datetime
+import sys
 import threading
+import time
+
+import six
 
 try:
-    import SocketServer as socketserver
     import cPickle as pickle
+    import SocketServer as socketserver
 except ImportError:
-    import socketserver
     import pickle
+    import socketserver
 
 
 def pad_message(message, blocklength):
     """Pad a message so its length is a multiple of blocklength."""
-    message_padded_length = (
-        int(math.ceil(len(message) / blocklength)) * blocklength)
+    message_padded_length = int(math.ceil(len(message) / blocklength)) * blocklength
     padded_message = message.ljust(message_padded_length)
 
     return padded_message
@@ -36,7 +36,7 @@ def receive_data_in_chunks(sock, buffersize):
         chunk = sock.recv(buffersize)
         chunks.append(chunk)
 
-    data = b''.join(chunks).strip()
+    data = b"".join(chunks).strip()
     return data
 
 
@@ -46,7 +46,7 @@ def send_data_in_chunks(data, sock, buffersize):
 
     # We send an empty chunk to signal that we're done
     # transmitting the message.
-    sock.send(b' ' * buffersize)
+    sock.send(b" " * buffersize)
 
 
 class MinimalServerHandler(socketserver.BaseRequestHandler):
@@ -66,19 +66,21 @@ def handle(self):
         # fails, pass the error as response (the client will raise
         # the exception)
         try:
-            response = getattr(
-                self.server.served_object, method_name)(*args, **kwargs)
+            response = getattr(self.server.served_object, method_name)(*args, **kwargs)
         except Exception as ex:
             response = ex
 
         # send the response to the client in chunks
         send_data_in_chunks(
             pickle.dumps(response, protocol=self.server.pickle_protocol),
-            self.request, self.server.buffersize)
+            self.request,
+            self.server.buffersize,
+        )
 
 
 class MinimalServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
     """TCP Server"""
+
     served_object = None
     buffersize = 2048
     pickle_protocol = None
@@ -87,8 +89,14 @@ class MinimalServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
 
 class MinimalClient(object):
     """Minimal client to provide communication with the server"""
 
-    def __init__(self, target_class, host='localhost', port=4444,
-                 buffersize=2048, pickle_protocol=None):
+    def __init__(
+        self,
+        target_class,
+        host="localhost",
+        port=4444,
+        buffersize=2048,
+        pickle_protocol=None,
+    ):
         """Initialize the client
         Args:
             target_class (object): the class to be served by the
@@ -118,8 +126,9 @@ def __init__(self, target_class, host='localhost', port=4444,
 
         # bind public methods on target_class here
         for method_name, method in inspect.getmembers(
-                target_class, predicate=predicate):
-            if method_name.startswith('_'):
+            target_class, predicate=predicate
+        ):
+            if method_name.startswith("_"):
                 continue
 
             setattr(self, method_name, self._func_req_wrapper(method_name))
@@ -131,8 +140,9 @@ def func_request(*args, **kwargs):
             """Send the request to the server"""
 
             # prepare the data
-            data = pickle.dumps((method_name, args, kwargs),
-                                protocol=self.pickle_protocol)
+            data = pickle.dumps(
+                (method_name, args, kwargs), protocol=self.pickle_protocol
+            )
 
             # open the socket
             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -150,7 +160,7 @@ def func_request(*args, **kwargs):
                 data = pickle.loads(response)
             except EOFError:
                 # server sent an empty message
-                msg = 'empty message received from the server.'
+                msg = "empty message received from the server."
                 raise RuntimeError(msg)
 
             # raises an exception if an exception was raised by the
@@ -164,8 +174,9 @@ def func_request(*args, **kwargs):
         return func_request
 
 
-def run_server(served_object, host='localhost',
-               port=4444, buffersize=2048, pickle_protocol=None):
+def run_server(
+    served_object, host="localhost", port=4444, buffersize=2048, pickle_protocol=None
+):
     """Runs the server
     Args:
         served_object (object): the object to be served by the
@@ -199,7 +210,7 @@ def run_server(served_object, host='localhost',
     server_thread.daemon = True
     server_thread.start()
     print(
-        '[{}] server running at {}:{} (press ^C to interrupt)'.format(
+        "[{}] server running at {}:{} (press ^C to interrupt)".format(
             datetime.datetime.now().isoformat(), host, port, server_thread.name
         )
     )
@@ -209,9 +220,7 @@ def run_server(served_object, host='localhost',
         while True:
             time.sleep(1)
     except KeyboardInterrupt:
-        print(
-            '\n[{}] server stopped'.format(datetime.datetime.now().isoformat())
-        )
+        print("\n[{}] server stopped".format(datetime.datetime.now().isoformat()))
 
         # Terminate the server
         server.shutdown()
diff --git a/quickumls/server.py b/quickumls/server.py
index cc32c2c..29deee6 100644
--- a/quickumls/server.py
+++ b/quickumls/server.py
@@ -13,7 +13,7 @@ def run_quickumls_server(opts):
         window=opts.window,
         min_match_length=opts.min_match_length,
         verbose=opts.verbose,
-        keep_uppercase=opts.keep_uppercase
+        keep_uppercase=opts.keep_uppercase,
     )
 
     run_server(matcher, host=opts.host, port=opts.port, buffersize=4096)
@@ -21,63 +21,75 @@ def run_quickumls_server(opts):
 
 def parse_args():
     ap = ArgumentParser(
-        prog='QuickUMLS server',
+        prog="QuickUMLS server",
         description=(
-            'For more detailed instructions, visit '
-            'github.com/Georgetown-IR-Lab/QuickUMLS'
-        )
+            "For more detailed instructions, visit "
+            "github.com/Georgetown-IR-Lab/QuickUMLS"
+        ),
     )
 
     # required arguments
     ap.add_argument(
-        'quickumls_fp',
-        help='directory where the QuickUMLS data files are installed.'
+        "quickumls_fp", help="directory where the QuickUMLS data files are installed."
     )
 
     # server configuration
+    ap.add_argument("-H", "--host", default="localhost", help="host of the server")
     ap.add_argument(
-        '-H', '--host', default='localhost',
-        help='host of the server'
-    )
-    ap.add_argument(
-        '-P', '--port', default=4645, type=int,
-        help='port on which the script responds'
+        "-P", "--port", default=4645, type=int, help="port on which the script responds"
     )
 
     # QuickUMLS options
     ap.add_argument(
-        '-t', '--threshold', default=0.7, type=float,
-        help='minimum similarity value between strings'
+        "-t",
+        "--threshold",
+        default=0.7,
+        type=float,
+        help="minimum similarity value between strings",
     )
     ap.add_argument(
-        '-o', '--overlapping_criteria', default='score',
-        choices=['score', 'length'],
-        help='criteria used to deal with overlapping concepts'
+        "-o",
+        "--overlapping_criteria",
+        default="score",
+        choices=["score", "length"],
+        help="criteria used to deal with overlapping concepts",
    )
     ap.add_argument(
-        '-s', '--similarity_name', default='jaccard',
-        choices=['dice', 'jaccard', 'cosine', 'overlap'],
-        help='name of similarity to use'
+        "-s",
+        "--similarity_name",
+        default="jaccard",
+        choices=["dice", "jaccard", "cosine", "overlap"],
+        help="name of similarity to use",
     )
     ap.add_argument(
-        '-w', '--window', default=5, type=int,
-        help='maximum number of tokens to consider for matching'
+        "-w",
+        "--window",
+        default=5,
+        type=int,
+        help="maximum number of tokens to consider for matching",
     )
     ap.add_argument(
-        '-l', '--min-match-length', default=3, type=int,
-        help='minimum length of a match'
+        "-l",
+        "--min-match-length",
+        default=3,
+        type=int,
+        help="minimum length of a match",
     )
     ap.add_argument(
-        '-v', '--verbose', action='store_true',
-        help='return verbose information while running'
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="return verbose information while running",
     )
     ap.add_argument(
-        '-u', '--keep_uppercase', action='store_true',
-        help='By default QuickUMLS converts all uppercase strings to lowercase'
-        '. This option disables that functionality, which makes QuickUMLS '
-        'useful for distinguishing acronyms from normal words. For this '
-        'the database should be installed without the -L option.'
+        "-u",
+        "--keep_uppercase",
+        action="store_true",
+        help="By default QuickUMLS converts all uppercase strings to lowercase. "
+        "This option disables that functionality, which makes QuickUMLS "
+        "useful for distinguishing acronyms from normal words. For this, "
+        "the database should be installed without the -L option.",
     )
 
     return ap.parse_args()
@@ -87,5 +99,5 @@ def main():
     run_quickumls_server(opts)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py
index f64982f..f268c4e 100644
--- a/quickumls/spacy_component.py
+++ b/quickumls/spacy_component.py
@@ -1,14 +1,17 @@
 import spacy
-from spacy.tokens import Span
 from spacy.strings import StringStore
+from spacy.tokens import Span
 
-from .core import QuickUMLS
 from . import constants
+from .core import QuickUMLS
+
 
 class SpacyQuickUMLS(object):
-    name = 'QuickUMLS matcher'
-    
-    def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs):
+    name = "QuickUMLS matcher"
+
+    def __init__(
+        self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs
+    ):
         """Instantiate SpacyQuickUMLS object
 
         This creates a QuickUMLS spaCy component which can be used in modular pipelines.
@@ -21,47 +24,53 @@ def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kw
             ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify. Defaults to False
             **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
         """
-        
-        self.quickumls = QuickUMLS(quickumls_fp, 
+
+        self.quickumls = QuickUMLS(
+            quickumls_fp,
             # By default, the QuickUMLS object creates its own internal spacy pipeline but this is not needed
             # when we're using it as a component in a pipeline
-            spacy_component = True,
-            **kwargs)
-        
+            spacy_component=True,
+            **kwargs
+        )
+
         # save this off so that we can get vocab values of labels later
         self.nlp = nlp
-        
+
         # keep these for matching
         self.best_match = best_match
         self.ignore_syntax = ignore_syntax
 
         # let's extend this with some properties that we want
-        Span.set_extension('similarity', default = -1.0)
-        Span.set_extension('semtypes', default = -1.0)
-        
+        Span.set_extension("similarity", default=-1.0)
+        Span.set_extension("semtypes", default=-1.0)
+
     def __call__(self, doc):
         # pass in the document which has been parsed to this point in the pipeline for ngrams and matches
-        matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax)
-        
+        matches = self.quickumls._match(
+            doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax
+        )
+
         # Convert QuickUMLS match objects into Spans
         for match in matches:
             # each match may match multiple ngrams
             for ngram_match_dict in match:
-                start_char_idx = int(ngram_match_dict['start'])
-                end_char_idx = int(ngram_match_dict['end'])
-                
-                cui = ngram_match_dict['cui']
+                start_char_idx = int(ngram_match_dict["start"])
+                end_char_idx = int(ngram_match_dict["end"])
+
+                cui = ngram_match_dict["cui"]
                 # add the string to the spacy vocab
                 self.nlp.vocab.strings.add(cui)
 
                 # pull out the value
                 cui_label_value = self.nlp.vocab.strings[cui]
-                
+
                 # char_span() creates a Span from these character indices
                 # UMLS CUI should work well as the label here
-                span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value)
+                span = doc.char_span(
+                    start_char_idx, end_char_idx, label=cui_label_value
+                )
 
                 # add some custom metadata to the spans
-                span._.similarity = ngram_match_dict['similarity']
-                span._.semtypes = ngram_match_dict['semtypes']
+                span._.similarity = ngram_match_dict["similarity"]
+                span._.semtypes = ngram_match_dict["semtypes"]
                 doc.ents = list(doc.ents) + [span]
-        
-        return doc
\ No newline at end of file
+
+        return doc
diff --git a/quickumls/toolbox.py b/quickumls/toolbox.py
index 3c7ea34..a8ffbb8 100644
--- a/quickumls/toolbox.py
+++ b/quickumls/toolbox.py
@@ -1,20 +1,22 @@
-from __future__ import unicode_literals, division, print_function
+from __future__ import division, print_function, unicode_literals
 
+import os
 
 # build-in modules
 import re
-import os
-from functools import wraps
-import six
 import unicodedata
+from functools import wraps
+from itertools import repeat, takewhile
 from string import punctuation
-from itertools import takewhile, repeat
-from six.moves import xrange
+
+import leveldb
 
 # installed modules
 import numpy
-import leveldb
+import six
+from six.moves import xrange
+
 try:
     import unqlite
+
     UNQLITE_AVAILABLE = True
 except ImportError:
     UNQLITE_AVAILABLE = False
@@ -22,7 +24,6 @@
 # project imports
 from quickumls_simstring import simstring
 
-
 # Python version specific imports
 if six.PY2:
     import cPickle as pickle
@@ -55,16 +56,16 @@ def safe_unicode(s):
     # in python 3, there's no ambiguity on whether
     # a string is encoded in bytes format or not
     try:
-        s = u'%s' % s
+        s = "%s" % s
     except UnicodeDecodeError:
-        s = u'%s' % s.decode('utf-8')
+        s = "%s" % s.decode("utf-8")
 
-    return u'{}'.format(unicodedata.normalize('NFKD', s))
+    return "{}".format(unicodedata.normalize("NFKD", s))
 
 
 def prepare_string_for_db_input(s):
     if six.PY2:
-        return s.encode('utf-8')
+        return s.encode("utf-8")
     else:
         return s
 
@@ -72,38 +73,60 @@ def prepare_string_for_db_input(s):
 def make_ngrams(s, n):
     # s = u'{t}{s}{t}'.format(s=safe_unicode(s), t=('$' * (n - 1)))
     n = len(s) if len(s) < n else n
-    return (s[i:i + n] for i in xrange(len(s) - n + 1))
+    return (s[i : i + n] for i in xrange(len(s) - n + 1))
 
 
 def get_similarity(x, y, n, similarity_name):
     if len(x) == 0 or len(y) == 0:
         # we define similarity between two strings
         # to be 0 if any of the two is empty.
-        return 0.
+        return 0.0
 
     X, Y = set(make_ngrams(x, n)), set(make_ngrams(y, n))
     intersec = len(X.intersection(Y))
 
-    if similarity_name == 'dice':
+    if similarity_name == "dice":
         return 2 * intersec / (len(X) + len(Y))
-    elif similarity_name == 'jaccard':
+    elif similarity_name == "jaccard":
         return intersec / (len(X) + len(Y) - intersec)
-    elif similarity_name == 'cosine':
+    elif similarity_name == "cosine":
         return intersec / numpy.sqrt(len(X) * len(Y))
-    elif similarity_name == 'overlap':
+    elif similarity_name == "overlap":
         return intersec
     else:
-        msg = 'Similarity {} not recognized'.format(similarity_name)
+        msg = "Similarity {} not recognized".format(similarity_name)
         raise TypeError(msg)
 
 
 class SimpleTokenizer(object):
     def __init__(self, stopwords=None, min_length=1, split_sym=None):
-        if stopwords == 'default':
+        if stopwords == "default":
             stopwords = [
-                'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
-                'from', 'has', 'he', 'in', 'is', 'its', 'of', 'on', 'or',
-                'that', 'the', 'to', 'was ', 'were', 'will', 'with'
+                "a",
+                "an",
+                "and",
+                "are",
+                "as",
+                "at",
+                "be",
+                "by",
+                "for",
+                "from",
+                "has",
+                "he",
+                "in",
+                "is",
+                "its",
+                "of",
+                "on",
+                "or",
+                "that",
+                "the",
+                "to",
+                "was",
+                "were",
+                "will",
+                "with",
             ]
         elif stopwords is None:
             stopwords = []
@@ -113,10 +136,10 @@ def __init__(self, stopwords=None, min_length=1, split_sym=None):
         if split_sym is None:
             split_sym = []
 
-        split_sym = punctuation + ''.join(split_sym)
+        split_sym = punctuation + "".join(split_sym)
 
         self.min_length = min_length
-        self.re_tokenize = re.compile(r'&\w+;|\W+|_')
+        self.re_tokenize = re.compile(r"&\w+;|\W+|_")
 
     def tokenize(self, text, lower=True):
         """Tokenize text"""
@@ -130,7 +153,8 @@ def tokenize_list(self, text, lower=True):
         if lower:
             text = text.lower()
         return [
-            tok for tok in self.re_tokenize.split(text)
+            tok
+            for tok in self.re_tokenize.split(text)
             if len(tok) >= self.min_length and tok not in self.stopwords
         ]
 
@@ -139,26 +163,23 @@ def db_key_encode(term):
     if six.PY2:
         return term
     else:
-        return term.encode('utf-8')
+        return term.encode("utf-8")
 
 
 def countlines(fn):
     """Count lines in fn.
    Slightly modified version of http://stackoverflow.com/a/27518377"""
-    with open(fn, 'rb') as f:
-        bufgen = takewhile(
-            lambda x: x, (f.read(1024 * 1024) for _ in repeat(None)))
-        ln = sum(buf.count(b'\n') for buf in bufgen)
+    with open(fn, "rb") as f:
+        bufgen = takewhile(lambda x: x, (f.read(1024 * 1024) for _ in repeat(None)))
+        ln = sum(buf.count(b"\n") for buf in bufgen)
     return ln


 class SimstringDBWriter(object):
     def __init__(self, path):
-        if not(os.path.exists(path)) or not(os.path.isdir(path)):
-            err_msg = (
-                '"{}" does not exists or it is not a directory.'
-            ).format(path)
+        if not (os.path.exists(path)) or not (os.path.isdir(path)):
+            err_msg = ('"{}" does not exist or is not a directory.').format(path)
             raise IOError(err_msg)
         else:
             try:
@@ -167,10 +188,10 @@ def __init__(self, path):
                 pass

             self.db = simstring.writer(
-                prepare_string_for_db_input(
-                    os.path.join(path, 'umls-terms.simstring')
-                ),
-                3, False, True
+                prepare_string_for_db_input(os.path.join(path, "umls-terms.simstring")),
+                3,
+                False,
+                True,
             )

     def insert(self, term):
@@ -180,16 +201,12 @@ def insert(self, term):

 class SimstringDBReader(object):
     def __init__(self, path, similarity_name, threshold):
-        if not(os.path.exists(path)) or not(os.path.isdir(path)):
-            err_msg = (
-                '"{}" does not exists or it is not a directory.'
-            ).format(path)
+        if not (os.path.exists(path)) or not (os.path.isdir(path)):
+            err_msg = ('"{}" does not exist or is not a directory.').format(path)
             raise IOError(err_msg)

         self.db = simstring.reader(
-            prepare_string_for_db_input(
-                os.path.join(path, 'umls-terms.simstring')
-            )
+            prepare_string_for_db_input(os.path.join(path, "umls-terms.simstring"))
         )
         self.db.measure = getattr(simstring, similarity_name)
         self.db.threshold = threshold
@@ -213,8 +230,7 @@ def _is_overlapping_intervals(self, a, b):

     def __contains__(self, interval):
         return any(
-            self._is_overlapping_intervals(interval, other)
-            for other in self.intervals
+            self._is_overlapping_intervals(interval, other) for other in self.intervals
         )

     def append(self, interval):
@@ -222,32 +238,31 @@ def append(self, interval):

 class CuiSemTypesDB(object):
-    def __init__(self, path, database_backend='leveldb'):
+    def __init__(self, path, database_backend="leveldb"):
         if not (os.path.exists(path) or os.path.isdir(path)):
-            err_msg = (
-                '"{}" is not a valid directory').format(path)
+            err_msg = ('"{}" is not a valid directory').format(path)
             raise IOError(err_msg)

-        if database_backend == 'unqlite':
+        if database_backend == "unqlite":
             assert UNQLITE_AVAILABLE, (
-                'You selected unqlite as database backend, but it is not '
-                'installed. Please install it via `pip install unqlite`'
+                "You selected unqlite as database backend, but it is not "
+                "installed. Please install it via `pip install unqlite`"
             )
-            self.cui_db = unqlite.UnQLite(os.path.join(path, 'cui.unqlite'))
+            self.cui_db = unqlite.UnQLite(os.path.join(path, "cui.unqlite"))
             self.cui_db_put = self.cui_db.store
             self.cui_db_get = self.cui_db.fetch
-            self.semtypes_db = unqlite.UnQLite(os.path.join(path, 'semtypes.unqlite'))
+            self.semtypes_db = unqlite.UnQLite(os.path.join(path, "semtypes.unqlite"))
             self.semtypes_db_put = self.semtypes_db.store
             self.semtypes_db_get = self.semtypes_db.fetch
-        elif database_backend == 'leveldb':
-            self.cui_db = leveldb.LevelDB(os.path.join(path, 'cui.leveldb'))
+        elif database_backend == "leveldb":
+            self.cui_db = leveldb.LevelDB(os.path.join(path, "cui.leveldb"))
             self.cui_db_put = self.cui_db.Put
             self.cui_db_get = self.cui_db.Get
-            self.semtypes_db = leveldb.LevelDB(os.path.join(path, 'semtypes.leveldb'))
+            self.semtypes_db = leveldb.LevelDB(os.path.join(path, "semtypes.leveldb"))
             self.semtypes_db_put = self.semtypes_db.Put
             self.semtypes_db_get = self.semtypes_db.Get
         else:
-            raise ValueError(f'database_backend {database_backend} not recognized')
+            raise ValueError(f"database_backend {database_backend} not recognized")

     def has_term(self, term):
         term = prepare_string_for_db_input(safe_unicode(term))
@@ -274,9 +289,7 @@ def insert(self, term, cui, semtypes, is_preferred):
         try:
             self.semtypes_db_get(db_key_encode(cui))
         except KeyError:
-            self.semtypes_db_put(
-                db_key_encode(cui), pickle.dumps(set(semtypes))
-            )
+            self.semtypes_db_put(db_key_encode(cui), pickle.dumps(set(semtypes)))

     def get(self, term):
         term = prepare_string_for_db_input(safe_unicode(term))
@@ -286,11 +299,56 @@ def get(self, term):
         cuis = set()

         matches = (
-            (
-                cui,
-                pickle.loads(self.semtypes_db_get(db_key_encode(cui))),
-                is_preferred
-            )
+            (cui, pickle.loads(self.semtypes_db_get(db_key_encode(cui))), is_preferred)
             for cui, is_preferred in cuis
         )
         return matches
+
+
+class CuiPrefDB(object):
+    def __init__(self, path, database_backend="leveldb"):
+        if not (os.path.exists(path) or os.path.isdir(path)):
+            err_msg = ('"{}" is not a valid directory').format(path)
+            raise IOError(err_msg)
+
+        if database_backend == "unqlite":
+            assert UNQLITE_AVAILABLE, (
+                "You selected unqlite as database backend, but it is not "
+                "installed. Please install it via `pip install unqlite`"
+            )
+            self.cui_db = unqlite.UnQLite(os.path.join(path, "cui_pref.unqlite"))
+            self.cui_db_put = self.cui_db.store
+            self.cui_db_get = self.cui_db.fetch
+        elif database_backend == "leveldb":
+            self.cui_db = leveldb.LevelDB(os.path.join(path, "cui_pref.leveldb"))
+            self.cui_db_put = self.cui_db.Put
+            self.cui_db_get = self.cui_db.Get
+        else:
+            raise ValueError(f"database_backend {database_backend} not recognized")
+
+    def has_cui(self, cui):
+        cui = prepare_string_for_db_input(safe_unicode(cui))
+        try:
+            self.cui_db_get(db_key_encode(cui))
+            return True
+        except KeyError:
+            return False
+
+    def insert(self, term, cui):
+        term = prepare_string_for_db_input(safe_unicode(term))
+        cui = prepare_string_for_db_input(safe_unicode(cui))
+
+        if self.has_cui(cui):
+            db_term = pickle.loads(self.cui_db_get(db_key_encode(cui)))
+            raise RuntimeError(
+                f"DB shouldn't contain preferred term for CUI twice: {cui}, {term}, {db_term}"
+            )
+
+        self.cui_db_put(db_key_encode(cui), pickle.dumps(term))
+
+    def get(self, cui):
+        cui = prepare_string_for_db_input(safe_unicode(cui))
+        try:
+            return pickle.loads(self.cui_db_get(db_key_encode(cui)))
+        except KeyError:
+            return
diff --git a/setup.py b/setup.py
index 35e23cd..6c163b4 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,9 @@
-from setuptools import setup, find_packages
-import os
+import contextlib
 import io
+import os
 import sys
-import contextlib
+
+from setuptools import find_packages, setup

 PACKAGES = find_packages()
@@ -22,21 +23,20 @@ def chdir(new_dir):
 def setup_package():
     root = os.path.abspath(os.path.dirname(__file__))

-    with open('README.md') as reader:
+    with open("README.md") as reader:
         readme = reader.read()

-    with open('requirements.txt') as f:
+    with open("requirements.txt") as f:
         requirements = f.read().splitlines()

     dependency_links = []
     i = 0
     while i < len(requirements):
-        if requirements[i].startswith('https://'):
+        if requirements[i].startswith("https://"):
             dependency_links.append(requirements.pop(i))
         else:
             i += 1

-
     # From https://github.com/explosion/spaCy/blob/master/setup.py
     with chdir(root):
         with io.open(os.path.join(root, "quickumls", "about.py"), encoding="utf8") as f:
@@ -44,19 +44,19 @@ def setup_package():
             exec(f.read(), about)

         setup(
-            name=about['__title__'],
-            version=about['__version__'],
+            name=about["__title__"],
+            version=about["__version__"],
             description=(
-                'QuickUMLS is a tool for fast, unsupervised biomedical '
-                'concept extraction from medical text'
+                "QuickUMLS is a tool for fast, unsupervised biomedical "
+                "concept extraction from medical text"
             ),
             packages=PACKAGES,
             long_description=readme,
-            long_description_content_type='text/markdown',
-            author=about['__author__'],
-            author_email=about['__email__'],
-            url='https://github.com/Georgetown-IR-Lab/QuickUMLS',
-            license=about['__license__'],
+            long_description_content_type="text/markdown",
+            author=about["__author__"],
+            author_email=about["__email__"],
+            url="https://github.com/Georgetown-IR-Lab/QuickUMLS",
+            license=about["__license__"],
             install_requires=requirements,
             dependency_links=dependency_links,
             classifiers=[
@@ -67,8 +67,9 @@ def setup_package():
                 "Development Status :: 5 - Production/Stable",
                 "Topic :: Scientific/Engineering :: Artificial Intelligence",
                 "Topic :: Scientific/Engineering :: Bio-Informatics",
-            ]
+            ],
         )

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     setup_package()
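
A quick usage sketch for the spaCy component reformatted above. This is a minimal sketch, not part of the patch: the class name and module path are assumptions (the class definition sits outside this hunk; this release exposes it as SpacyQuickUMLS in quickumls/spacy_component.py), the QuickUMLS data directory is a placeholder, and the add_pipe call is the spaCy v2-style API this code targets.

    import spacy
    from quickumls.spacy_component import SpacyQuickUMLS  # assumed export

    nlp = spacy.load("en_core_web_sm")  # any installed spaCy model works
    # the component builds its own internal matcher (spacy_component=True)
    component = SpacyQuickUMLS(nlp, "/path/to/quickumls/install")  # placeholder path
    nlp.add_pipe(component, last=True)

    doc = nlp("Patient denies chest pain but reports persistent headaches.")
    for ent in doc.ents:
        # label_ holds the CUI; the custom extensions registered in __init__
        # carry the match similarity and the UMLS semantic types
        print(ent.text, ent.label_, ent._.similarity, ent._.semtypes)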
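The similarity measures in get_similarity are easy to sanity-check by hand, since each one is computed from the two sets of character n-grams. With trigrams, "heart" yields {hea, ear, art} and "hearts" yields {hea, ear, art, rts}, so the intersection has size 3:

    from quickumls.toolbox import get_similarity, make_ngrams

    print(sorted(make_ngrams("heart", 3)))                  # ['art', 'ear', 'hea']
    print(get_similarity("heart", "hearts", 3, "jaccard"))  # 3 / (3 + 4 - 3) = 0.75
    print(get_similarity("heart", "hearts", 3, "dice"))     # 2 * 3 / (3 + 4) ≈ 0.857
    print(get_similarity("heart", "hearts", 3, "overlap"))  # 3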
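Finally, a sketch of how the new CuiPrefDB is meant to be used: it stores exactly one preferred term per CUI, raises on duplicate inserts, and returns None for unknown CUIs. The install directory and the CUI/term pair below are illustrative only.

    from quickumls.toolbox import CuiPrefDB

    db = CuiPrefDB("/path/to/quickumls/install")  # directory must already exist
    db.insert("Diabetes Mellitus", "C0011849")    # note the (term, cui) argument order
    print(db.get("C0011849"))                     # -> 'Diabetes Mellitus'
    print(db.get("C9999999"))                     # unknown CUI -> None
    db.insert("Diabetes", "C0011849")             # raises RuntimeError (duplicate CUI)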