From 7f515b9b9b46e96362bf5ddc60c8ac1b945250aa Mon Sep 17 00:00:00 2001
From: Jan Trienes
Date: Wed, 25 Oct 2023 21:14:53 -0500
Subject: [PATCH] Use pathlib Path in corpus_loader

This should (hopefully) fix corpus loading on Windows systems
---
 deidentify/dataset/corpus_loader.py | 30 ++++++++++++----------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/deidentify/dataset/corpus_loader.py b/deidentify/dataset/corpus_loader.py
index 50c60ca..370e774 100644
--- a/deidentify/dataset/corpus_loader.py
+++ b/deidentify/dataset/corpus_loader.py
@@ -1,30 +1,23 @@
-import glob
-from os.path import basename, dirname, join, normpath, splitext
+from pathlib import Path
 
 from deidentify.base import Corpus, Document
 from deidentify.dataset import brat
 
-BASE_PATH = join(dirname(__file__), '../../data/corpus/')
-DUMMY_CORPUS = join(BASE_PATH, 'dummy/')
-
-_CORPUS_DIRS = map(lambda dir: basename(normpath(dir)), glob.glob(BASE_PATH + '/*/'))
-CORPUS_PATH = {c: join(BASE_PATH, c + '/') for c in _CORPUS_DIRS}
-
-
-def get_basename(full_path):
-    return splitext(basename(full_path))[0]
+BASE_PATH = Path(__file__).resolve().parent / '../../data/corpus'
+DUMMY_CORPUS = BASE_PATH / 'dummy'
+CORPUS_PATH = {p.name: p for p in BASE_PATH.glob('*/')}
 
 
 class CorpusLoader:
 
     @staticmethod
-    def _load_folder(path):
-        files = glob.glob(join(path, '*.ann'))
+    def _load_folder(path: Path):
+        files = path.glob('*.ann')
         files = sorted(files)
 
         documents = []
         for file in files:
-            doc_name = get_basename(file)
+            doc_name = file.stem
             annotations, text = brat.load_brat_document(path, doc_name)
             doc = Document(name=doc_name, text=text, annotations=annotations)
             documents.append(doc)
@@ -32,10 +25,11 @@ def _load_folder(path):
         return documents
 
     def load_corpus(self, path) -> Corpus:
-        corpus_name = basename(normpath(path))
+        path = Path(path)
+        corpus_name = path.name
 
-        train = self._load_folder(join(path, 'train'))
-        test = self._load_folder(join(path, 'test'))
-        dev = self._load_folder(join(path, 'dev'))
+        train = self._load_folder(path / 'train')
+        test = self._load_folder(path / 'test')
+        dev = self._load_folder(path / 'dev')
 
         return Corpus(train=train, test=test, dev=dev, name=corpus_name)