diff --git a/.gitignore b/.gitignore index 0a0d7ed..6350bb3 100644 --- a/.gitignore +++ b/.gitignore @@ -161,10 +161,12 @@ cython_debug/ #Other - potentially vs code *.DS_Store -**/.DS_Store +*/.DS_Store #MLFlow etc **/artifacts/model **/artifacts/** -**/mlruns/** \ No newline at end of file +**/mlruns/** +Chapter08/.DS_Store +Chapter09/.DS_Store diff --git a/Chapter03/automl/Dockerfile b/Chapter03/automl/Dockerfile new file mode 100644 index 0000000..741bca3 --- /dev/null +++ b/Chapter03/automl/Dockerfile @@ -0,0 +1,39 @@ +FROM ubuntu:20.04 + +# install linux packages +RUN apt-get update + +# Set the locale +# workaround for https://github.com/automl/auto-sklearn/issues/867 +RUN apt-get -y install locales +RUN touch /usr/share/locale/locale.alias +RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && locale-gen +ENV LANG=en_US.UTF-8 +ENV LANGUAGE=en_US:en +ENV LC_ALL=en_US.UTF-8 + +# set environment variables to only use one core (ENV persists into later layers and the running container; a `RUN export` is lost as soon as that RUN's shell exits) +ENV OPENBLAS_NUM_THREADS=1 +ENV MKL_NUM_THREADS=1 +ENV BLAS_NUM_THREADS=1 +ENV OMP_NUM_THREADS=1 + +# install build requirements +RUN apt install -y python3-dev python3-pip +RUN pip3 install --upgrade setuptools +RUN apt install -y build-essential + +RUN apt install -y swig + +# Copy the checkout autosklearn version for installation +#ADD . /auto-sklearn/ + +# Upgrade pip then install dependencies +RUN pip3 install --upgrade pip + +# Install +RUN pip3 install "auto-sklearn[test, examples]" + +COPY autosklearn_example.py autosklearn_example.py + +CMD ["python3", "autosklearn_example.py"] \ No newline at end of file diff --git a/Chapter03/automl/README.md b/Chapter03/automl/README.md new file mode 100644 index 0000000..c9337a3 --- /dev/null +++ b/Chapter03/automl/README.md @@ -0,0 +1,10 @@ +# Autosklearn example +There are known issues around installing auto-sklearn on macOS and Windows systems, so I have set this up to run in a Docker container. 
+ +To run this example just run the following (this assumes you have already run ```conda env create -f mlewp-chapter03.yml```): + +```bash +docker build -t autosklearn . +docker run autosklearn +``` + diff --git a/Chapter03/automl/autosklearn_example.py b/Chapter03/automl/autosklearn_example.py index 2e7c577..8a00fe3 100644 --- a/Chapter03/automl/autosklearn_example.py +++ b/Chapter03/automl/autosklearn_example.py @@ -2,15 +2,20 @@ import sklearn.datasets import sklearn.metrics import autosklearn.classification +from sklearn.datasets import load_wine +from sklearn.model_selection import train_test_split automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, per_run_time_limit=30 ) +X, y = load_wine(return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) + automl.fit(X_train, y_train, dataset_name='wine') print(automl.show_models()) print(automl.sprint_statistics()) predictions = automl.predict(X_test) -sklearn.metrics.accuracy_score(y_test, predictions) +print(sklearn.metrics.accuracy_score(y_test, predictions)) \ No newline at end of file diff --git a/Chapter03/automl/run_autosklearn_example.sh b/Chapter03/automl/run_autosklearn_example.sh new file mode 100644 index 0000000..5509468 --- /dev/null +++ b/Chapter03/automl/run_autosklearn_example.sh @@ -0,0 +1,2 @@ +docker build -t autosklearn_image . 
+docker run -it autosklearn_image \ No newline at end of file diff --git a/Chapter03/features/feature-engineering.py b/Chapter03/features/feature-engineering.py index c68c0a7..594ecf7 100644 --- a/Chapter03/features/feature-engineering.py +++ b/Chapter03/features/feature-engineering.py @@ -9,7 +9,6 @@ # Make a 70/30 train/test split X_train, X_test, y_train, y_test = train_test_split(X, y, - test_size=0.30, test_size=0.30, random_state=42) diff --git a/Chapter03/hyperparameter-opt/optuna_example.py b/Chapter03/hyperparameter-opt/optuna_example.py index 31a4397..01c8f47 100644 --- a/Chapter03/hyperparameter-opt/optuna_example.py +++ b/Chapter03/hyperparameter-opt/optuna_example.py @@ -53,5 +53,4 @@ def objective(trial, n_folds, X, y): study = optuna.create_study(direction='minimize') study.optimize(partial(objective, n_folds=n_folds, X=X_train, y=y_train), n_trials=16) - print(study.best_trial.params) - print(stu) + print(study.best_trial.params) \ No newline at end of file diff --git a/Chapter03/mlewp-chapter03.yml b/Chapter03/mlewp-chapter03.yml index 9ec4025..3f5602f 100644 --- a/Chapter03/mlewp-chapter03.yml +++ b/Chapter03/mlewp-chapter03.yml @@ -2,232 +2,210 @@ name: mlewp-chapter03 channels: - conda-forge dependencies: - - appnope=0.1.3 - - asttokens=2.2.1 - - backcall=0.2.0 - - backports=1.0 - - backports.functools_lru_cache=1.6.4 + - aiohappyeyeballs=2.4.4 + - aiohttp=3.11.11 + - aiosignal=1.3.2 + - alembic=1.14.1 + - annotated-types=0.7.0 + - async-timeout=5.0.1 + - attrs=25.1.0 + - aws-c-auth=0.8.1 + - aws-c-cal=0.8.1 + - aws-c-common=0.10.6 + - aws-c-compression=0.3.0 + - aws-c-event-stream=0.5.0 + - aws-c-http=0.9.2 + - aws-c-io=0.15.3 + - aws-c-mqtt=0.11.0 + - aws-c-s3=0.7.9 + - aws-c-sdkutils=0.2.2 + - aws-checksums=0.2.2 + - aws-crt-cpp=0.29.9 + - aws-sdk-cpp=1.11.489 + - azure-core-cpp=1.14.0 + - azure-identity-cpp=1.10.0 + - azure-storage-blobs-cpp=12.13.0 + - azure-storage-common-cpp=12.8.0 + - azure-storage-files-datalake-cpp=12.12.0 + - 
bcrypt=4.2.1 + - blinker=1.9.0 + - brotli=1.1.0 + - brotli-bin=1.1.0 + - brotli-python=1.1.0 - bzip2=1.0.8 - - ca-certificates=2022.12.7 - - comm=0.1.3 - - debugpy=1.6.7 - - decorator=5.1.1 - - executing=1.2.0 - - importlib-metadata=6.6.0 - - importlib_metadata=6.6.0 - - ipykernel=6.22.0 - - ipython=8.13.1 - - jedi=0.18.2 - - jupyter_client=8.2.0 - - jupyter_core=5.3.0 - - libcxx=16.0.2 + - c-ares=1.34.4 + - ca-certificates=2025.1.31 + - cachetools=5.5.1 + - certifi=2024.12.14 + - cffi=1.17.1 + - charset-normalizer=3.4.1 + - click=8.1.8 + - cloudpickle=3.1.1 + - colorama=0.4.6 + - colorlog=6.9.0 + - contourpy=1.3.1 + - cryptography=44.0.0 + - cycler=0.12.1 + - databricks-sdk=0.43.0 + - deprecated=1.2.18 + - docker-py=7.1.0 + - entrypoints=0.4 + - flask=3.1.0 + - fonttools=4.55.8 + - freetype=2.12.1 + - frozenlist=1.5.0 + - gflags=2.2.2 + - gitdb=4.0.12 + - gitpython=3.1.44 + - glog=0.7.1 + - google-auth=2.38.0 + - graphene=3.4.3 + - graphql-core=3.2.6 + - graphql-relay=3.2.0 + - greenlet=3.1.1 + - gunicorn=23.0.0 + - h2=4.2.0 + - hpack=4.1.0 + - hyperframe=6.1.0 + - icu=75.1 + - idna=3.10 + - importlib-metadata=8.6.1 + - itsdangerous=2.2.0 + - jinja2=3.1.5 + - joblib=1.4.2 + - kiwisolver=1.4.7 + - krb5=1.21.3 + - lcms2=2.16 + - lerc=4.0.0 + - libabseil=20240722.0 + - libarrow=18.1.0 + - libarrow-acero=18.1.0 + - libarrow-dataset=18.1.0 + - libarrow-substrait=18.1.0 + - libblas=3.9.0 + - libbrotlicommon=1.1.0 + - libbrotlidec=1.1.0 + - libbrotlienc=1.1.0 + - libcblas=3.9.0 + - libcrc32c=1.1.2 + - libcurl=8.11.1 + - libcxx=19.1.7 + - libdeflate=1.23 + - libedit=3.1.20250104 + - libev=4.33 + - libevent=2.1.12 - libffi=3.4.2 - - libsodium=1.0.18 - - libsqlite=3.40.0 - - libzlib=1.2.13 - - matplotlib-inline=0.1.6 - - ncurses=6.3 - - nest-asyncio=1.5.6 - - openssl=3.1.0 - - packaging=23.1 - - parso=0.8.3 - - pexpect=4.8.0 - - pickleshare=0.7.5 - - pip=23.1.2 - - platformdirs=3.5.0 - - prompt-toolkit=3.0.38 - - prompt_toolkit=3.0.38 - - psutil=5.9.5 - - ptyprocess=0.7.0 - 
- pure_eval=0.2.2 - - pygments=2.15.1 + - libgfortran=5.0.0 + - libgfortran5=13.2.0 + - libgoogle-cloud=2.34.0 + - libgoogle-cloud-storage=2.34.0 + - libgrpc=1.67.1 + - libiconv=1.17 + - libjpeg-turbo=3.0.0 + - liblapack=3.9.0 + - liblzma=5.6.4 + - liblzma-devel=5.6.4 + - libnghttp2=1.64.0 + - libopenblas=0.3.28 + - libparquet=18.1.0 + - libpng=1.6.46 + - libprotobuf=5.28.3 + - libre2-11=2024.07.02 + - libsodium=1.0.20 + - libsqlite=3.48.0 + - libssh2=1.11.1 + - libthrift=0.21.0 + - libtiff=4.7.0 + - libutf8proc=2.10.0 + - libwebp-base=1.5.0 + - libxcb=1.17.0 + - libxml2=2.13.5 + - libzlib=1.3.1 + - llvm-openmp=19.1.7 + - lz4-c=1.10.0 + - mako=1.3.9 + - markdown=3.6 + - markupsafe=3.0.2 + - matplotlib-base=3.10.0 + - mlflow=2.20.1 + - mlflow-skinny=2.20.1 + - mlflow-ui=2.20.1 + - multidict=6.1.0 + - munkres=1.1.4 + - ncurses=6.5 + - numpy=1.26.4 + - openjpeg=2.5.3 + - openssl=3.4.0 + - opentelemetry-api=1.16.0 + - opentelemetry-sdk=1.16.0 + - opentelemetry-semantic-conventions=0.37b0 + - optuna=4.2.0 + - orc=2.0.3 + - packaging=24.2 + - pandas=2.2.2 + - paramiko=3.5.1 + - pillow=11.1.0 + - pip=25.0 + - prometheus_client=0.21.1 + - prometheus_flask_exporter=0.23.1 + - propcache=0.2.1 + - protobuf=5.28.3 + - pthread-stubs=0.4 + - py4j=0.10.9.7 + - pyarrow=18.1.0 + - pyarrow-core=18.1.0 + - pyasn1=0.6.1 + - pyasn1-modules=0.4.1 + - pycparser=2.22 + - pydantic=2.10.6 + - pydantic-core=2.27.2 + - pynacl=1.5.0 + - pyopenssl=25.0.0 + - pyparsing=3.2.1 + - pysocks=1.7.1 + - pyspark=3.5.4 - python=3.10.8 - - python-dateutil=2.8.2 + - python-dateutil=2.9.0.post0 + - python-tzdata=2025.1 - python_abi=3.10 - - pyzmq=25.0.2 + - pytz=2024.2 + - pyu2f=0.1.5 + - pywin32-on-windows=0.1.0 + - pyyaml=6.0.2 + - qhull=2020.2 + - querystring_parser=1.2.4 + - re2=2024.07.02 - readline=8.2 - - setuptools=67.7.2 - - six=1.16.0 - - stack_data=0.6.2 - - tk=8.6.12 - - traitlets=5.9.0 - - typing-extensions=4.5.0 - - typing_extensions=4.5.0 - - wcwidth=0.2.6 - - wheel=0.40.0 - - xz=5.2.6 - - 
zeromq=4.3.4 - - zipp=3.15.0 - - pip: - - absl-py==1.4.0 - - alembic==1.10.4 - - alibi-detect==0.11.2 - - anyio==3.6.2 - - argon2-cffi==21.3.0 - - argon2-cffi-bindings==21.2.0 - - arrow==1.2.3 - - astunparse==1.6.3 - - attrs==23.1.0 - - auto-sklearn==0.15.0 - - beautifulsoup4==4.12.2 - - bleach==6.0.0 - - blinker==1.6.2 - - cachetools==5.3.0 - - catalogue==2.0.8 - - certifi==2022.12.7 - - cffi==1.15.1 - - charset-normalizer==3.1.0 - - click==8.1.3 - - cloudpickle==2.2.1 - - cmaes==0.9.1 - - colorlog==6.7.0 - - configspace==0.4.21 - - contourpy==1.0.7 - - cycler==0.11.0 - - cython==0.29.34 - - dask==2023.4.1 - - databricks-cli==0.17.6 - - defusedxml==0.7.1 - - dill==0.3.6 - - distributed==2023.4.1 - - distro==1.8.0 - - docker==6.0.1 - - emcee==3.1.4 - - entrypoints==0.4 - - evidently==0.3.1 - - fastjsonschema==2.16.3 - - filelock==3.12.0 - - flask==2.3.1 - - flatbuffers==23.3.3 - - fonttools==4.39.3 - - fqdn==1.5.1 - - fsspec==2023.4.0 - - future==0.18.3 - - gast==0.4.0 - - gitdb==4.0.10 - - gitpython==3.1.31 - - google-auth==2.17.3 - - google-auth-oauthlib==1.0.0 - - google-pasta==0.2.0 - - grpcio==1.54.0 - - gunicorn==20.1.0 - - h5py==3.8.0 - - huggingface-hub==0.14.1 - - hyperopt==0.2.7 - - idna==3.4 - - imageio==2.28.0 - - ipython-genutils==0.2.0 - - isoduration==20.11.0 - - itsdangerous==2.1.2 - - jax==0.4.8 - - jinja2==3.1.2 - - joblib==1.2.0 - - jsonpointer==2.3 - - jsonschema==4.17.3 - - jupyter-events==0.6.3 - - jupyter-server==2.5.0 - - jupyter-server-terminals==0.4.4 - - jupyterlab-pygments==0.2.2 - - keras==2.12.0 - - kiwisolver==1.4.4 - - lazy-loader==0.2 - - liac-arff==2.5.0 - - libclang==16.0.0 - - llvmlite==0.39.1 - - locket==1.0.0 - - mako==1.2.4 - - markdown==3.4.3 - - markupsafe==2.1.2 - - matplotlib==3.7.1 - - mistune==2.0.5 - - ml-dtypes==0.1.0 - - mlflow==2.3.1 - - msgpack==1.0.5 - - nbclassic==0.5.6 - - nbclient==0.7.4 - - nbconvert==7.3.1 - - nbformat==5.8.0 - - networkx==3.1 - - nltk==3.8.1 - - notebook==6.5.4 - - notebook-shim==0.2.3 - - 
numba==0.56.4 - - numpy==1.23.5 - - oauthlib==3.2.2 - - opencv-python==4.7.0.72 - - opt-einsum==3.3.0 - - optuna==3.1.1 - - pandas==2.0.1 - - pandocfilters==1.5.0 - - partd==1.4.0 - - patsy==0.5.3 - - pillow==9.5.0 - - plotly==5.14.1 - - prometheus-client==0.16.0 - - protobuf==4.22.3 - - py4j==0.10.9.7 - - pyarrow==11.0.0 - - pyasn1==0.5.0 - - pyasn1-modules==0.3.0 - - pycparser==2.21 - - pydantic==1.10.7 - - pyjwt==2.6.0 - - pynisher==0.6.4 - - pynndescent==0.5.10 - - pyparsing==3.0.9 - - pyrfr==0.8.3 - - pyrsistent==0.19.3 - - python-json-logger==2.0.7 - - pytz==2023.3 - - pywavelets==1.4.1 - - pyyaml==6.0 - - querystring-parser==1.2.4 - - regex==2023.3.23 - - requests==2.29.0 - - requests-oauthlib==1.3.1 - - rfc3339-validator==0.1.4 - - rfc3986-validator==0.1.1 - - rsa==4.9 - - scikit-image==0.20.0 - - scikit-learn==0.24.2 - - scipy==1.10.1 - - send2trash==1.8.2 - - smac==1.2 - - smmap==5.0.0 - - sniffio==1.3.0 - - sortedcontainers==2.4.0 - - soupsieve==2.4.1 - - sqlalchemy==2.0.11 - - sqlparse==0.4.4 - - statsmodels==0.13.5 - - tabulate==0.9.0 - - tblib==1.7.0 - - tenacity==8.2.2 - - tensorboard==2.12.2 - - tensorboard-data-server==0.7.0 - - tensorboard-plugin-wit==1.8.1 - - tensorflow-estimator==2.12.0 - - tensorflow-macos==2.12.0 - - tensorflow-metal==0.8.0 - - termcolor==2.3.0 - - terminado==0.17.1 - - threadpoolctl==3.1.0 - - tifffile==2023.4.12 - - tinycss2==1.2.1 - - tokenizers==0.13.3 - - toml==0.10.2 - - toolz==0.12.0 - - tornado==6.3.1 - - tqdm==4.65.0 - - transformers==4.28.1 - - tzdata==2023.3 - - umap-learn==0.5.3 - - uri-template==1.2.0 - - urllib3==1.26.15 - - webcolors==1.13 - - webencodings==0.5.1 - - websocket-client==1.5.1 - - werkzeug==2.3.2 - - wrapt==1.14.1 - - zict==3.0.0 -prefix: /opt/homebrew/Caskroom/miniforge/base/envs/mlewp-chapter03 + - requests=2.32.3 + - rsa=4.9 + - scikit-learn=1.6.1 + - scipy=1.15.1 + - setuptools=75.8.0 + - six=1.17.0 + - smmap=5.0.0 + - snappy=1.2.1 + - sqlalchemy=2.0.37 + - sqlparse=0.5.3 + - 
threadpoolctl=3.5.0 + - tk=8.6.13 + - tqdm=4.67.1 + - typing-extensions=4.12.2 + - typing_extensions=4.12.2 + - tzdata=2025a + - unicodedata2=16.0.0 + - urllib3=2.3.0 + - websocket-client=1.8.0 + - werkzeug=3.1.3 + - wheel=0.45.1 + - wrapt=1.17.2 + - xorg-libxau=1.0.12 + - xorg-libxdmcp=1.1.5 + - xz=5.6.4 + - xz-gpl-tools=5.6.4 + - xz-tools=5.6.4 + - yaml=0.2.5 + - yarl=1.18.3 + - zipp=3.21.0 + - zstandard=0.23.0 + - zstd=1.5.6 +prefix: /opt/homebrew/Caskroom/miniforge/base/envs/mlewp-chapter03-hotfix diff --git a/Chapter03/mlflow-advanced/mlflow-feature-engineering.py b/Chapter03/mlflow-advanced/mlflow-feature-engineering.py index b25fc3c..039e2f5 100644 --- a/Chapter03/mlflow-advanced/mlflow-feature-engineering.py +++ b/Chapter03/mlflow-advanced/mlflow-feature-engineering.py @@ -15,7 +15,7 @@ if __name__=="__main__": # assume you have already run 'start-mlflow-server.sh' - mlflow.set_tracking_uri("http://localhost:5000") + mlflow.set_tracking_uri("http://localhost:8000") X, y = load_wine(return_X_y=True) diff --git a/Chapter03/mlflow-advanced/mlflow.db b/Chapter03/mlflow-advanced/mlflow.db new file mode 100644 index 0000000..87140bf Binary files /dev/null and b/Chapter03/mlflow-advanced/mlflow.db differ diff --git a/Chapter03/mlflow-advanced/start-mlflow-server.sh b/Chapter03/mlflow-advanced/start-mlflow-server.sh index 822e78a..49b0ea2 100644 --- a/Chapter03/mlflow-advanced/start-mlflow-server.sh +++ b/Chapter03/mlflow-advanced/start-mlflow-server.sh @@ -1,5 +1,6 @@ mlflow server \ --backend-store-uri sqlite:///mlflow.db \ --default-artifact-root ./artifacts \ - --host 0.0.0.0 + --host 0.0.0.0 \ + --port 8000 diff --git a/Chapter03/pipelines/sklearn_pipeline.py b/Chapter03/pipelines/sklearn_pipeline.py index 479b3be..8be03dc 100644 --- a/Chapter03/pipelines/sklearn_pipeline.py +++ b/Chapter03/pipelines/sklearn_pipeline.py @@ -3,6 +3,8 @@ from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler, OneHotEncoder from 
sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split +import pandas as pd numeric_features = ['age', 'balance'] numeric_transformer = Pipeline(steps=[ @@ -22,5 +24,10 @@ clf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())]) +df = pd.read_csv('../../Chapter01/classifying/bank_data/bank.csv', delimiter=';', decimal=',') +X, y = df.drop('y', axis=1), df['y'].apply(lambda x: 1 if x == 'yes' else 0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) +# You need to get bank.csv (read above from ../../Chapter01/classifying/bank_data) before this will run clf_pipeline.fit(X_train, y_train) +print(clf_pipeline.predict(X_test)) diff --git a/Chapter03/pipelines/sparkmllib_pipeline.py b/Chapter03/pipelines/sparkmllib_pipeline.py index 14535e8..c49e586 100644 --- a/Chapter03/pipelines/sparkmllib_pipeline.py +++ b/Chapter03/pipelines/sparkmllib_pipeline.py @@ -16,7 +16,7 @@ # Get the data and place it in a spark dataframe data = spark.read.format("csv").option("sep", ";").option("inferSchema", "true").option("header", "true").load( - "../../chapter1/stream-classifier/data/bank/bank.csv") + "../../Chapter01/classifying/bank_data/bank.csv") # map target to numerical category data = data.withColumn('label', f.when((f.col("y") == "yes"), 1).otherwise(0)) @@ -68,3 +68,5 @@ # Define the entire pipeline and fit on the train data and transform on the test data clfPipeline = Pipeline().setStages(stages).fit(trainingData) clfPipeline.transform(testData) + + clfPipeline.transform(testData).show() diff --git a/Chapter05/mlewp2-airflow/aws-mwaa-local-runner b/Chapter05/mlewp2-airflow/aws-mwaa-local-runner new file mode 160000 index 0000000..2e40031 --- /dev/null +++ b/Chapter05/mlewp2-airflow/aws-mwaa-local-runner @@ -0,0 +1 @@ +Subproject commit 2e4003132892e7ffaef4a1071369899d7c3d8456