diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f19206b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.git.zip
+.DS_Store
diff --git a/0.1_official_module_with_invoker.zip b/0.1_official_module_with_invoker.zip
new file mode 100644
index 0000000..1db1532
Binary files /dev/null and b/0.1_official_module_with_invoker.zip differ
diff --git a/0.1_official_module_with_invoker/clean_missing_data.yaml b/0.1_official_module_with_invoker/clean_missing_data.yaml
new file mode 100644
index 0000000..26f145a
--- /dev/null
+++ b/0.1_official_module_with_invoker/clean_missing_data.yaml
@@ -0,0 +1,135 @@
+moduleIdentifier:
+  namespace: zhizhu.com/test
+  moduleName: New Clean Missing Data
+  moduleVersion: 0.0.116.1
+isDeterministic: true
+category: Data Transformation
+description: Specifies how to handle the values missing from a dataset.
+releaseState: Release
+inputs:
+- name: Dataset
+  type: DataFrameDirectory
+  port: true
+  description: Dataset to be cleaned
+- name: Columns to be cleaned
+  type: ColumnPicker
+  description: Columns for missing values clean operation
+  columnPickerFor: Dataset
+- name: Minimum missing value ratio
+  type: Float
+  description: Clean only columns with missing value ratio above specified value, out
+    of set of all selected columns
+  default: 0.0
+  min: 0.0
+  max: 1.0
+- name: Maximum missing value ratio
+  type: Float
+  default: 1.0
+  description: Clean only columns with missing value ratio below specified value,
+    out of set of all selected columns
+  min: 0.0
+  max: 1.0
+- name: Cleaning mode
+  type: Mode
+  default: Custom substitution value
+  description: Algorithm to clean missing values
+  options:
+  - Custom substitution value:
+    - name: Replacement value
+      type: String
+      default: '0'
+      optional: true
+      description: Type the value that takes the place of missing values
+    - name: Generate missing value indicator column
+      type: Boolean
+      description: Generate a column that indicates which rows were cleaned
+  - Replace with mean:
+    - name: Cols with all missing values
+      type: Mode
+      default: Remove
+      description: Cols with all missing values
+      options:
+      - Propagate
+      - Remove
+    - name: Generate missing value indicator column
+      type: Boolean
+      description: Generate a column that indicates which rows were cleaned
+  - Replace with median:
+    - name: Cols with all missing values
+      type: Mode
+      default: Remove
+      description: Cols with all missing values
+      options:
+      - Propagate
+      - Remove
+    - name: Generate missing value indicator column
+      type: Boolean
+      description: Generate a column that indicates which rows were cleaned
+  - Replace with mode:
+    - name: Cols with all missing values
+      type: Mode
+      default: Remove
+      description: Cols with all missing values
+      options:
+      - Propagate
+      - Remove
+    - name: Generate missing value indicator column
+      type: Boolean
+      description: Generate a column that indicates which rows were cleaned
+  - Remove entire row
+  - Remove entire column
+outputs:
+- name: Cleaned dataset
+  type: DataFrameDirectory
+  description: Cleaned dataset
+- name: Cleaning transformation
+  type: TransformationDirectory
+  description: Transformation to be passed to Apply Transformation module to clean
+    new data
+implementation:
+  container:
+    runConfig:
+      baseDockerImage: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
+      gpuSupport: false
+    conda:
+      name: project_environment
+      channels:
+      - defaults
+      dependencies:
+      - python=3.6.8
+      - pip:
+        - azureml-designer-classic-modules==0.0.116
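+    # Unlike the 0_official_module variant, the command below first launches the
+    # bundled invoker.py, which in turn spawns the official module entry point.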
+    command:
+    - python
+    - invoker.py
+    - python
+    - -m
+    - azureml.studio.modulehost.module_invoker
+    - --module-name=azureml.studio.modules.datatransform.clean_missing_data.clean_missing_data
+    args:
+    - --dataset
+    - inputPath: Dataset
+    - --columns-to-be-cleaned
+    - inputValue: Columns to be cleaned
+    - --minimum-missing-value-ratio
+    - inputValue: Minimum missing value ratio
+    - --maximum-missing-value-ratio
+    - inputValue: Maximum missing value ratio
+    - --cleaning-mode
+    - inputValue: Cleaning mode
+    - - --replacement-value
+      - inputValue: Replacement value
+    - - --cols-with-all-missing-values
+      - inputValue: Cols with all missing values
+    - - --generate-missing-value-indicator-column
+      - inputValue: Generate missing value indicator column
+    - --cleaned-dataset
+    - outputPath: Cleaned dataset
+    - --cleaning-transformation
+    - outputPath: Cleaning transformation
+    invoking:
+      module: azureml.studio.modules.datatransform.clean_missing_data.clean_missing_data
+      class: CleanMissingDataModule
+      func: run
diff --git a/0.1_official_module_with_invoker/invoker.py b/0.1_official_module_with_invoker/invoker.py
new file mode 100644
index 0000000..4eba60d
--- /dev/null
+++ b/0.1_official_module_with_invoker/invoker.py
@@ -0,0 +1,39 @@
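+# Thin launcher shared by these samples: it runs the wrapped command as a
+# subprocess, streaming its stdout/stderr, and exits with the child's return
+# code so the module host observes the real outcome of the run.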
+import subprocess
+import sys
+
+
+def run(command: list, timeout=60000):
+    if not command:
+        return
+
+    return subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr).wait(timeout=timeout)
+
+
+INVOKER_VERSION = '0.0.6'
+
+
+def is_invoking_official_module(args):
+    return len(args) >= 3 and args[0] == 'python' and args[1] == '-m' and args[2].startswith('azureml.studio.')
+
+
+def generate_run_command(args):
+    return [arg for arg in args]
+
+
+def execute(args):
+    is_custom_module = not is_invoking_official_module(args)
+    module_type = 'custom module' if is_custom_module else 'official module'
+    print('Invoking {} by invoker {}.'.format(module_type, INVOKER_VERSION))
+
+    ret = run(generate_run_command(args))
+
+    # Propagate the subprocess's return code as this process's exit code.
+    exit(ret)
+
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+    execute(args)
diff --git a/0_official_module.zip b/0_official_module.zip
new file mode 100644
index 0000000..096ffc7
Binary files /dev/null and b/0_official_module.zip differ
diff --git a/0_official_module/clean_missing_data.yaml b/0_official_module/clean_missing_data.yaml
new file mode 100644
index 0000000..67c9dab
--- /dev/null
+++ b/0_official_module/clean_missing_data.yaml
@@ -0,0 +1,131 @@
+moduleIdentifier:
+  namespace: zhizhu.com/test
+  moduleName: New Clean Missing Data
+  moduleVersion: 0.0.116
+isDeterministic: true
+category: Data Transformation
+description: Specifies how to handle the values missing from a dataset.
+releaseState: Release
+inputs:
+- name: Dataset
+  type: DataFrameDirectory
+  port: true
+  description: Dataset to be cleaned
+- name: Columns to be cleaned
+  type: ColumnPicker
+  description: Columns for missing values clean operation
+  columnPickerFor: Dataset
+- name: Minimum missing value ratio
+  type: Float
+  description: Clean only columns with missing value ratio above specified value, out
+    of set of all selected columns
+  default: 0.0
+  min: 0.0
+  max: 1.0
+- name: Maximum missing value ratio
+  type: Float
+  default: 1.0
+  description: Clean only columns with missing value ratio below specified value,
+    out of set of all selected columns
+  min: 0.0
+  max: 1.0
+- name: Cleaning mode
+  type: Mode
+  default: Custom substitution value
+  description: Algorithm to clean missing values
+  options:
+  - Custom substitution value:
+    - name: Replacement value
+      type: String
+      default: '0'
+      optional: true
+      description: Type the value that takes the place of missing values
+    - name: Generate missing value indicator column
+      type: Boolean
+      description: Generate a column that indicates which rows were cleaned
+  - Replace with mean:
+    - name: Cols with all missing values
+      type: Mode
+      default: Remove
+      description: Cols with all missing values
+      options:
+      - Propagate
+      - Remove
+    - name: Generate missing value indicator column
+      type: Boolean
+      description: Generate a column that indicates which rows were cleaned
+  - Replace with median:
+    - name: Cols with all missing values
+      type: Mode
+      default: Remove
+      description: Cols with all missing values
+      options:
+      - Propagate
+      - Remove
+    - name: Generate missing value indicator column
+      type: Boolean
+      description: Generate a column that indicates which rows were cleaned
+  - Replace with mode:
+    - name: Cols with all missing values
+      type: Mode
+      default: Remove
+      description: Cols with all missing values
+      options:
+      - Propagate
+      - Remove
+    - name: Generate missing value indicator column
+      type: Boolean
+      description: Generate a column that indicates which rows were cleaned
+  - Remove entire row
+  - Remove entire column
+outputs:
+- name: Cleaned dataset
+  type: DataFrameDirectory
+  description: Cleaned dataset
+- name: Cleaning transformation
+  type: TransformationDirectory
+  description: Transformation to be passed to Apply Transformation module to clean
+    new data
+implementation:
+  container:
+    runConfig:
+      baseDockerImage: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
+      gpuSupport: false
+    conda:
+      name: project_environment
+      channels:
+      - defaults
+      dependencies:
+      - python=3.6.8
+      - pip:
+        - azureml-designer-classic-modules==0.0.116
+    command:
+    - python
+    - -m
+    - azureml.studio.modulehost.module_invoker
+    - --module-name=azureml.studio.modules.datatransform.clean_missing_data.clean_missing_data
+    args:
+    - --dataset
+    - inputPath: Dataset
+    - --columns-to-be-cleaned
+    - inputValue: Columns to be cleaned
+    - --minimum-missing-value-ratio
+    - inputValue: Minimum missing value ratio
+    - --maximum-missing-value-ratio
+    - inputValue: Maximum missing value ratio
+    - --cleaning-mode
+    - inputValue: Cleaning mode
+    - - --replacement-value
+      - inputValue: Replacement value
+    - - --cols-with-all-missing-values
+      - inputValue: Cols with all missing values
+    - - --generate-missing-value-indicator-column
+      - inputValue: Generate missing value indicator column
+    - --cleaned-dataset
+    - outputPath: Cleaned dataset
+    - --cleaning-transformation
+    - outputPath: Cleaning transformation
+    invoking:
+      module: azureml.studio.modules.datatransform.clean_missing_data.clean_missing_data
+      class: CleanMissingDataModule
+      func: run
diff --git a/0_official_module/invoker.py b/0_official_module/invoker.py
new file mode 100644
index 0000000..4eba60d
--- /dev/null
+++ b/0_official_module/invoker.py
@@ -0,0 +1,36 @@
+import subprocess
+import sys
+
+
+def run(command: list, timeout=60000):
+    if not command:
+        return
+
+    return subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr).wait(timeout=timeout)
+
+
+INVOKER_VERSION = '0.0.6'
+
+
+def is_invoking_official_module(args):
+    return len(args) >= 3 and args[0] == 'python' and args[1] == '-m' and args[2].startswith('azureml.studio.')
+
+
+def generate_run_command(args):
+    return [arg for arg in args]
+
+
+def execute(args):
+    is_custom_module = not is_invoking_official_module(args)
+    module_type = 'custom module' if is_custom_module else 'official module'
+    print('Invoking {} by invoker {}.'.format(module_type, INVOKER_VERSION))
+
+    ret = run(generate_run_command(args))
+
+    # Propagate the subprocess's return code as this process's exit code.
+    exit(ret)
+
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+    execute(args)
diff --git a/10_invalid_yaml.zip b/10_invalid_yaml.zip
new file mode 100644
index 0000000..dd852bc
Binary files /dev/null and b/10_invalid_yaml.zip differ
diff --git a/10_invalid_yaml/invalid.yaml b/10_invalid_yaml/invalid.yaml
new file mode 100644
index 0000000..8507427
--- /dev/null
+++ b/10_invalid_yaml/invalid.yaml
@@ -0,0 +1 @@
+hello:
diff --git a/10_invalid_yaml/invalid2.yaml b/10_invalid_yaml/invalid2.yaml
new file mode 100644
index 0000000..843842c
--- /dev/null
+++ b/10_invalid_yaml/invalid2.yaml
@@ -0,0 +1 @@
+incomplete_list: [
diff --git a/10_invalid_yaml/invalid3.yaml b/10_invalid_yaml/invalid3.yaml
new file mode 100644
index 0000000..7cc86ad
--- /dev/null
+++ b/10_invalid_yaml/invalid3.yaml
@@ -0,0 +1 @@
+666
diff --git a/10_invalid_yaml/invalid4.yaml b/10_invalid_yaml/invalid4.yaml
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/10_invalid_yaml/invalid4.yaml
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/10_invalid_yaml/merge.py b/10_invalid_yaml/merge.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/10_invalid_yaml/merge.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/10_invalid_yaml/module_spec.yaml b/10_invalid_yaml/module_spec.yaml
new file mode 100644
index 0000000..f1d90bb
--- /dev/null
+++ b/10_invalid_yaml/module_spec.yaml
@@ -0,0 +1,234 @@
+moduleIdentifier:
+  namespace: microsoft.com/office
+  moduleName: [AE365][SmartReply][AML][Test] PyTorch Distributed
+  moduleVersion: 0.0.2.0309172714
+type: Mpi
+metadata:
+  annotations:
+    familyId: 439f6c66-ab56-4798-9bab-ed3154c71c3d
+    labels:
+    - Office
+    - AE365
+    - SmartReply
+    - AML
+    - Test
+inputs:
+- name: train_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: valid_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: gmr_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: vocab_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: rsp_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: pretrained_model_path
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: model_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: architecture
+  type: Enum
+  default: bert_matching_model
+  options:
+  - matching_model
+  - bert_matching_model
+- name: pretrained_model_file
+  type: Enum
+  default: bert_encoder_epoch_500.pt
+  options:
+  - bert_encoder_epoch_500.pt
+  optional: true
+- name: run_mode
+  type: Enum
+  default: train
+  options:
+  - train
+  - eval
+  - export
+- name: load_from
+  type: Enum
+  default: tnlr
+  options:
+  - bert
+  - tnlr
+  optional: true
+- name: max_epochs
+  type: Integer
+  default: 5
+  optional: true
+- name: lm_alpha
+  type: Float
+  default: 1.4
+  optional: true
+- name: batch_size
+  type: Integer
+  default: 256
+  optional: true
+- name: batch_size_infer
+  type: Integer
+  default: 256
+  optional: true
+- name: learning_rate
+  type: Float
+  default: 3e-4
+  optional: true
+- name: max_msg_len
+  type: Integer
+  default: 64
+  optional: true
+- name: max_rsp_len
+  type: Integer
+  default: 64
+  optional: true
+- name: decay_step
+  type: Integer
+  default: 2000
+  optional: true
+- name: decay_rate
+  type: Float
+  default: 0.99
+  optional: true
+- name: warmup_proportion
+  type: Float
+  default: 0.0002
+  optional: true
+- name: loss_scale
+  type: Integer
+  default: 0
+  optional: true
+- name: tokenizer
+  type: Enum
+  default: wordpiece
+  options:
+  - wordpiece
+  - sentencepiece
+  optional: true
+- name: gradient_accumulation_steps
+  type: Integer
+  default: 1
+  optional: true
+- name: optimizer
+  type: Enum
+  default: adam
+  options:
+  - adam
+  - adadelta
+  optional: true
+- name: validation_freq
+  type: Integer
+  default: -1
+  optional: true
+- name: save_freq
+  type: Integer
+  default: -1
+  optional: true
+- name: manual_seed
+  type: Integer
+  default: 42
+  optional: true
+- name: infer_batches
+  type: Integer
+  default: 1000
+  optional: true
+outputs:
+- name: model_output_dir
+  type: AzureEncryptedBlobReference
+- name: eval_output_dir
+  type: AzureEncryptedBlobReference
+runConfig:
+  nodeCount:
+    type: Integer
+    default: 4
+    min: 1
+  processCountPerNode:
+    type: Integer
+    default: 2
+implementation:
+  container:
+    image: mcr.microsoft.com/azureml/bert:pretrain-openmpi3.1.2-cuda10.0-cudnn7-ubuntu16.04
+    command: [/opt/miniconda/envs/amlbert/bin/python, smartreply/models/matching/driver.py]
+    args:
+    - --train_input_dir
+    - inputPath: train_input_dir
+    - --valid_input_dir
+    - inputPath: valid_input_dir
+    - --gmr_input_dir
+    - inputPath: gmr_input_dir
+    - --vocab_input_dir
+    - inputPath: vocab_input_dir
+    - --rsp_input_dir
+    - inputPath: rsp_input_dir
+    - --pretrained_model_path
+    - inputPath: pretrained_model_path
+    - --model_input_dir
+    - inputPath: model_input_dir
+    - --model_output_dir
+    - outputPath: model_output_dir
+    - --eval_output_dir
+    - outputPath: eval_output_dir
+    - --architecture
+    - inputValue: architecture
+    - --pretrained_model_file
+    - inputValue: pretrained_model_file
+    - --run_mode
+    - inputValue: run_mode
+    - --load_from
+    - inputValue: load_from
+    - --max_epochs
+    - inputValue: max_epochs
+    - --lm_alpha
+    - inputValue: lm_alpha
+    - --batch_size
+    - inputValue: batch_size
+    - --batch_size_infer
+    - inputValue: batch_size_infer
+    - --learning_rate
+    - inputValue: learning_rate
+    - --max_msg_len
+    - inputValue: max_msg_len
+    - --max_rsp_len
+    - inputValue: max_rsp_len
+    - --decay_step
+    - inputValue: decay_step
+    - --decay_rate
+    - inputValue: decay_rate
+    - --warmup_proportion
+    - inputValue: warmup_proportion
+    - --loss_scale
+    - inputValue: loss_scale
+    - --tokenizer
+    - inputValue: tokenizer
+    - --gradient_accumulation_steps
+    - inputValue: gradient_accumulation_steps
+    - --optimizer
+    - inputValue: optimizer
+    - --validation_freq
+    - inputValue: validation_freq
+    - --save_freq
+    - inputValue: save_freq
+    - --manual_seed
+    - inputValue: manual_seed
+    - --infer_batches
+    - inputValue: infer_batches
diff --git a/1_basic_kfc.zip b/1_basic_kfc.zip
new file mode 100644
index 0000000..dbeb342
Binary files /dev/null and b/1_basic_kfc.zip differ
diff --git a/1_basic_kfc/kubeflow.yaml b/1_basic_kfc/kubeflow.yaml
new file mode 100644
index 0000000..5874544
--- /dev/null
+++ b/1_basic_kfc/kubeflow.yaml
@@ -0,0 +1,24 @@
+moduleIdentifier:
+  namespace: zhizhu.com/test
+  moduleName: xgboost4j - Train classifier
+  moduleVersion: 0.0.1
+description: Trains a boosted tree ensemble classifier using xgboost4j
+
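+# Kubeflow-component flavored spec: ports and parameters are written as inline
+# mappings, and `help` is used where the other samples use `description`.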
+inputs:
+- {name: Training data, type: DataFrameDirectory}
+- {name: Rounds, type: Integer, default: '30', help: Number of training rounds}
+
+outputs:
+- {name: Trained model, type: XGBoost model, help: Trained XGBoost model}
+
+implementation:
+  container:
+    image: gcr.io/ml-pipeline/xgboost-classifier-train@sha256:b3a64d57
+    command: [
+      /ml/train.py,
+      --train-set, {inputPath: Training data},
+      --rounds, {inputValue: Rounds},
+      --out-model, {outputPath: Trained model},
+    ]
diff --git a/1_basic_kfc/ml/train.py b/1_basic_kfc/ml/train.py
new file mode 100644
index 0000000..4eba60d
--- /dev/null
+++ b/1_basic_kfc/ml/train.py
@@ -0,0 +1,36 @@
+import subprocess
+import sys
+
+
+def run(command: list, timeout=60000):
+    if not command:
+        return
+
+    return subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr).wait(timeout=timeout)
+
+
+INVOKER_VERSION = '0.0.6'
+
+
+def is_invoking_official_module(args):
+    return len(args) >= 3 and args[0] == 'python' and args[1] == '-m' and args[2].startswith('azureml.studio.')
+
+
+def generate_run_command(args):
+    return [arg for arg in args]
+
+
+def execute(args):
+    is_custom_module = not is_invoking_official_module(args)
+    module_type = 'custom module' if is_custom_module else 'official module'
+    print('Invoking {} by invoker {}.'.format(module_type, INVOKER_VERSION))
+
+    ret = run(generate_run_command(args))
+
+    # Propagate the subprocess's return code as this process's exit code.
+    exit(ret)
+
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+    execute(args)
diff --git a/2.2_eselect_fix_port_type.zip b/2.2_eselect_fix_port_type.zip
new file mode 100644
index 0000000..3ce7436
Binary files /dev/null and b/2.2_eselect_fix_port_type.zip differ
diff --git a/2.2_eselect_fix_port_type/eselect.yaml b/2.2_eselect_fix_port_type/eselect.yaml
new file mode 100644
index 0000000..5345eb9
--- /dev/null
+++ b/2.2_eselect_fix_port_type/eselect.yaml
@@ -0,0 +1,25 @@
+moduleIdentifier:
+  namespace: example.com/modules
+  moduleName: eselect
+  moduleVersion: 0.0.2
+description: Selects columns from input file based on the column description in the first line. Similar to cut (and grep), but column names can be used.
+inputs:
+- name: columns
+  type: String
+- name: input
+  type: AnyDirectory
+outputs:
+- name: output
+  type: AnyFile
+implementation:
+  container:
+    image: ttthree/modules:latest
+    command: [python, /home/invoker.py]
+    args:
+    - dotnet
+    - /home/etools/eselect.dll
+    - -f
+    - inputValue: columns
+    - inputPath: input
+    - outputPath: output
+
diff --git a/2.3_eselect_commands_with_placeholders.zip b/2.3_eselect_commands_with_placeholders.zip
new file mode 100644
index 0000000..f6125d2
Binary files /dev/null and b/2.3_eselect_commands_with_placeholders.zip differ
diff --git a/2.3_eselect_commands_with_placeholders/module_spec.yaml b/2.3_eselect_commands_with_placeholders/module_spec.yaml
new file mode 100644
index 0000000..7b10212
--- /dev/null
+++ b/2.3_eselect_commands_with_placeholders/module_spec.yaml
@@ -0,0 +1,19 @@
+moduleIdentifier:
+  namespace: example.com/modules
+  moduleName: eselect
+  moduleVersion: 0.0.1
+description: Selects columns from input file based on the column description in the first line. Similar to cut (and grep), but column names can be used.
+inputs:
+  - {name: columns, type: String, description: 'column name list with ; as delimiter'}
+  - {name: input, type: AnyFile, description: 'the input file path, only TSV supported'}
+outputs:
+  - {name: output, type: AnyFile, description: 'the output file path'}
+implementation:
+  container:
+    image: ttthree/modules:latest
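+    # Placeholders such as {inputValue: columns} may appear inline in the
+    # command list instead of a separate args section.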
+    command: [
+      python, /home/invoker.py, dotnet, /home/etools/eselect.dll, -f, {inputValue: columns}, {inputPath: input}, {outputPath: output}
+    ]
+
diff --git a/2_eselect.zip b/2_eselect.zip
new file mode 100644
index 0000000..eca5f07
Binary files /dev/null and b/2_eselect.zip differ
diff --git a/2_eselect/eselect.yaml b/2_eselect/eselect.yaml
new file mode 100644
index 0000000..1cb776d
--- /dev/null
+++ b/2_eselect/eselect.yaml
@@ -0,0 +1,24 @@
+moduleIdentifier:
+  moduleName: eselect
+  moduleVersion: 0.0.1
+description: Selects columns from input file based on the column description in the first line. Similar to cut (and grep), but column names can be used.
+inputs:
+- name: columns
+  type: String
+- name: input
+  type: String
+outputs:
+- name: output
+  type: AnyFile
+implementation:
+  container:
+    image: ttthree/modules:latest
+    command: [python, /home/invoker.py]
+    args:
+    - dotnet
+    - /home/etools/eselect.dll
+    - -f
+    - inputValue: columns
+    - inputPath: input
+    - outputPath: output
+
diff --git a/3_basic_module.zip b/3_basic_module.zip
new file mode 100644
index 0000000..7d064e1
Binary files /dev/null and b/3_basic_module.zip differ
diff --git a/3_basic_module/basic_module.yaml b/3_basic_module/basic_module.yaml
new file mode 100644
index 0000000..799c73d
--- /dev/null
+++ b/3_basic_module/basic_module.yaml
@@ -0,0 +1,35 @@
+moduleIdentifier:
+  namespace: example.com/modules
+  moduleName: Basic Module
+  moduleVersion: 0.0.2
+description: |
+  Basic module for demo.
+  No `type` specified in spec and will default to 'Basic'.
+  Demos how to specify an environment from a pre-built docker image.
+metadata:
+  properties:
+    tags: [nlp, bert]
+    contact: AzureML Studio Team
+    helpDocument: http://readthedocs.com/proj
+inputs:
+  - name: Input Port
+    type: DataFrameDirectory
+  - name: Parameter 1
+    type: String
+    default: hello
+  - name: Parameter 2
+    type: Integer
+    default: 1
+outputs:
+  - name: Output Port
+    type: DataFrameDirectory
+implementation:
+  container:
+    image: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
+    command: [python, module_entry.py]
+    args: [
+      --input, {inputPath: Input Port },
+      --parameter-1, { inputValue: Parameter 1 },
+      --parameter-2, { inputValue: Parameter 2 },
+      --output, { outputPath: Output Port },
+    ]
diff --git a/3_basic_module/module_entry.py b/3_basic_module/module_entry.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/3_basic_module/module_entry.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/4_mpi_module.zip b/4_mpi_module.zip
new file mode 100644
index 0000000..838ca29
Binary files /dev/null and b/4_mpi_module.zip differ
diff --git a/4_mpi_module/module_entry.py b/4_mpi_module/module_entry.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/4_mpi_module/module_entry.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/4_mpi_module/mpi_module.yaml b/4_mpi_module/mpi_module.yaml
new file mode 100644
index 0000000..6055327
--- /dev/null
+++ b/4_mpi_module/mpi_module.yaml
@@ -0,0 +1,63 @@
+moduleIdentifier:
+  namespace: microsoft.com/office
+  moduleName: Mpi Module
+  moduleVersion: 0.0.1
+type: Mpi
+description: Mpi module for demo.
+metadata:
+  properties:
+    tags: [nlp, bert]
+    contact: AzureML Studio Team
+    helpDocument: http://readthedocs.com/proj
+inputs:
+  - name: Input Port
+    type: [AnyFile, AnyDirectory]
+  - name: Parameter 1
+    type: String
+    default: hello
+    description: Input a greeting message.
+  - name: Parameter 2
+    type: Enum
+    default: Red
+    options: [Red, Green, Blue]
+    description: Choose your favorite color.
+  - name: Parameter 3
+    type: Integer
+    default: 1
+    min: 0
+    max: 10
+    description: The Integer parameter which has a range validation.
+outputs:
+  - name: Output Port
+    type: AnyDirectory
+runConfig:
+  nodeCount:
+    type: Integer
+    default: 4
+    min: 1
+    max: 10
+  processCountPerNode:
+    type: Integer
+    default: 2
+implementation:
+  container:
+    amlEnvironment:
+      docker:
+        baseImage: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
+      python:
+        condaDependencies:
+          name: project_environment
+          channels:
+            - defaults
+          dependencies:
+            - python=3.6.8
+            - pip:
+              - azureml-designer-classic-modules==0.0.116
+    command: [python, module_entry.py]
+    args: [
+      --input, {inputPath: Input Port },
+      --parameter-1, { inputValue: Parameter 1 },
+      --parameter-2, { inputValue: Parameter 2 },
+      --parameter-3, { inputValue: Parameter 3 },
+      --output, { outputPath: Output Port },
+    ]
diff --git a/5_mpi_module_using_env.zip b/5_mpi_module_using_env.zip
new file mode 100644
index 0000000..0eef7a2
Binary files /dev/null and b/5_mpi_module_using_env.zip differ
diff --git a/5_mpi_module_using_env/module_entry.py b/5_mpi_module_using_env/module_entry.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/5_mpi_module_using_env/module_entry.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/5_mpi_module_using_env/mpi_module.yaml b/5_mpi_module_using_env/mpi_module.yaml
new file mode 100644
index 0000000..c630667
--- /dev/null
+++ b/5_mpi_module_using_env/mpi_module.yaml
@@ -0,0 +1,63 @@
+moduleIdentifier:
+  namespace: microsoft.com/office
+  moduleName: Mpi Module
+  moduleVersion: 0.0.1
+type: Mpi
+description: |
+  Mpi module for demo.
+  To create an MPI module:
+  * Set `type` to 'Mpi'.
+  * Optional: add `runConfig` to specify run-level configurations.
+  This module also demos:
+  * Input port which supports multiple data types.
+  * Specification of Integer / Enum parameters.
+  * How to use a pre-registered environment of the workspace.
+metadata:
+  properties:
+    tags: [nlp, bert]
+    contact: AzureML Studio Team
+    helpDocument: http://readthedocs.com/proj
+inputs:
+  - name: Input Port
+    type: [DataFrameDirectory, ImageDirectory]
+  - name: Parameter 1
+    type: String
+    default: hello
+    description: Input a greeting message.
+  - name: Parameter 2
+    type: Enum
+    default: Red
+    options: [Red, Green, Blue]
+    description: Choose your favorite color.
+  - name: Parameter 3
+    type: Integer
+    default: 1
+    min: 0
+    max: 10
+    description: The Integer parameter which has a range validation.
+outputs:
+  - name: Output Port
+    type: DataFrameDirectory
+runConfig:
+  nodeCount:
+    type: Integer
+    default: 4
+    min: 1
+    max: 10
+  processCountPerNode:
+    type: Integer
+    default: 2
+implementation:
+  container:
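+    # References the environment DemoEnv (0.0.1) pre-registered in the workspace.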
+    amlEnvironment:
+      name: DemoEnv
+      version: 0.0.1
+    command: [python, module_entry.py]
+    args: [
+      --input, {inputPath: Input Port },
+      --parameter-1, { inputValue: Parameter 1 },
+      --parameter-2, { inputValue: Parameter 2 },
+      --parameter-3, { inputValue: Parameter 3 },
+      --output, { outputPath: Output Port },
+    ]
diff --git a/6_hdi_module.zip b/6_hdi_module.zip
new file mode 100644
index 0000000..1f04727
Binary files /dev/null and b/6_hdi_module.zip differ
diff --git a/6_hdi_module/hdi_module.yaml b/6_hdi_module/hdi_module.yaml
new file mode 100644
index 0000000..c0eae11
--- /dev/null
+++ b/6_hdi_module/hdi_module.yaml
@@ -0,0 +1,34 @@
+moduleIdentifier:
+  namespace: microsoft.com/office/demo
+  moduleName: HDInsight Module
+  moduleVersion: 0.0.1
+type: HDInsight
+description: |
+  HDInsight module for demo.
+metadata:
+  properties:
+    tags: [nlp, bert]
+    contact: AzureML Studio Team
+    helpDocument: http://readthedocs.com/proj
+inputs:
+  - name: Input Port
+    type: DataFrameDirectory
+  - name: Parameter 1
+    type: String
+    default: hello
+  - name: Parameter 2
+    type: Integer
+    default: 1
+outputs:
+  - name: Output Port
+    type: DataFrameDirectory
+implementation:
+  container:
+    image: office/module_runner_docker
+    command: [python, module_entry.py]
+    args: [
+      --input, {inputPath: Input Port },
+      --parameter-1, { inputValue: Parameter 1 },
+      --parameter-2, { inputValue: Parameter 2 },
+      --output, { outputPath: Output Port },
+    ]
diff --git a/6_hdi_module/module_entry.py b/6_hdi_module/module_entry.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/6_hdi_module/module_entry.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/7.1_kfc_merge_files_add_aml_identifier.zip b/7.1_kfc_merge_files_add_aml_identifier.zip
new file mode 100644
index 0000000..4ac60d8
Binary files /dev/null and b/7.1_kfc_merge_files_add_aml_identifier.zip differ
diff --git a/7.1_kfc_merge_files_add_aml_identifier/kfc.yaml b/7.1_kfc_merge_files_add_aml_identifier/kfc.yaml
new file mode 100644
index 0000000..c71c9b2
--- /dev/null
+++ b/7.1_kfc_merge_files_add_aml_identifier/kfc.yaml
@@ -0,0 +1,22 @@
+moduleIdentifier:
+  moduleName: Merge every n files together cdd3b0ba
+  moduleVersion: 0.0.1
+  namespace: test/test
+inputs:
+- name: InputFolder
+  type: LocalPath
+- name: merge_count
+  type: Integer
+  default: '100'
+  optional: true
+outputs:
+- name: OutputFolder
+  type: LocalPath
+implementation:
+  container:
+    image: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
+    command: [python, merge.py]
+    args:
+    - inputPath: InputFolder
+    - outputPath: OutputFolder
+    - inputValue: merge_count
diff --git a/7.1_kfc_merge_files_add_aml_identifier/merge.py b/7.1_kfc_merge_files_add_aml_identifier/merge.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/7.1_kfc_merge_files_add_aml_identifier/merge.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/7.2_kfc_merge_files_datatype_has_dicts.zip b/7.2_kfc_merge_files_datatype_has_dicts.zip
new file mode 100644
index 0000000..d287ef4
Binary files /dev/null and b/7.2_kfc_merge_files_datatype_has_dicts.zip differ
diff --git a/7.2_kfc_merge_files_datatype_has_dicts/kfc.yaml b/7.2_kfc_merge_files_datatype_has_dicts/kfc.yaml
new file mode 100644
index 0000000..9ff41a9
--- /dev/null
+++ b/7.2_kfc_merge_files_datatype_has_dicts/kfc.yaml
@@ -0,0 +1,62 @@
+moduleIdentifier:
+  moduleName: Merge every n files together cdd3b0ba
+  moduleVersion: 0.0.1
+  namespace: test/test
+inputs:
+- name: InputFolder with single simple type
+  type: String
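+# A port's type may be a bare name or a mapping that attaches an OpenAPI
+# schema used to validate the value, as the entries below demonstrate.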
+- name: InputFolder with single dict type
+  type:
+    LocalPath:
+      openapi_schema_validator:
+        type: string
+        pattern: "^file://.*$"
+- name: InputFolder with multiple dict types
+  type:
+  - LocalPath:
+      openapi_schema_validator:
+        type: string
+        pattern: "^file://.*$"
+  - RemotePath:
+      openapi_schema_validator:
+        type: string
+        pattern: "^http://.*$"
+- name: InputFolder with merged type
+  type:
+  - LocalPath:
+      openapi_schema_validator:
+        type: string
+        pattern: "^file://.*$"
+  - String
+- name: InputFolder with multiple simple types
+  type:
+  - DateTime
+  - String
+- name: merge_count
+  type: Integer
+  default: '100'
+  optional: true
+outputs:
+- name: OutputFolder with dict type
+  type:
+    LocalPath:
+      openapi_schema_validator:
+        type: string
+        pattern: "^file://.*$"
+- name: OutputFolder with simple type
+  type: String
+implementation:
+  container:
+    image: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
+    command: [python, merge.py]
+    args:
+    - inputPath: InputFolder with single simple type
+    - inputPath: InputFolder with single dict type
+    - inputPath: InputFolder with multiple dict types
+    - inputPath: InputFolder with merged type
+    - inputPath: InputFolder with multiple simple types
+    - outputPath: OutputFolder with dict type
+    - outputPath: OutputFolder with simple type
+    - inputValue: merge_count
diff --git a/7.2_kfc_merge_files_datatype_has_dicts/merge.py b/7.2_kfc_merge_files_datatype_has_dicts/merge.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/7.2_kfc_merge_files_datatype_has_dicts/merge.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/7_kfc_merge_files.zip b/7_kfc_merge_files.zip
new file mode 100644
index 0000000..8678b00
Binary files /dev/null and b/7_kfc_merge_files.zip differ
diff --git a/7_kfc_merge_files/kfc.yaml b/7_kfc_merge_files/kfc.yaml
new file mode 100644
index 0000000..c71c9b2
--- /dev/null
+++ b/7_kfc_merge_files/kfc.yaml
@@ -0,0 +1,22 @@
+moduleIdentifier:
+  moduleName: Merge every n files together cdd3b0ba
+  moduleVersion: 0.0.1
+  namespace: test/test
+inputs:
+- name: InputFolder
+  type: LocalPath
+- name: merge_count
+  type: Integer
+  default: '100'
+  optional: true
+outputs:
+- name: OutputFolder
+  type: LocalPath
+implementation:
+  container:
+    image: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
+    command: [python, merge.py]
+    args:
+    - inputPath: InputFolder
+    - outputPath: OutputFolder
+    - inputValue: merge_count
diff --git a/7_kfc_merge_files/merge.py b/7_kfc_merge_files/merge.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/7_kfc_merge_files/merge.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/8_merge_files.zip b/8_merge_files.zip
new file mode 100644
index 0000000..9beee5f
Binary files /dev/null and b/8_merge_files.zip differ
diff --git a/8_merge_files/merge.py b/8_merge_files/merge.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/8_merge_files/merge.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/8_merge_files/module_spec.yaml b/8_merge_files/module_spec.yaml
new file mode 100644
index 0000000..77ec9b5
--- /dev/null
+++ b/8_merge_files/module_spec.yaml
@@ -0,0 +1,27 @@
+moduleIdentifier:
+  namespace: microsoft.com/office
+  moduleName: Merge every n files together
+  moduleVersion: 0.0.2
+metadata:
+  annotations:
+    familyId: cdd3b0ba-a1a1-4a70-bb71-88b9d59904b7
+    tags:
+    - Office
+inputs:
+- name: InputFolder
+  type: [AzureEncryptedBlobReference, AnyDirectory]
+- name: merge_count
+  type: Integer
+  default: 100
+  optional: true
+outputs:
+- name: OutputFolder
+  type: AzureEncryptedBlobReference
+implementation:
+  container:
+    image: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
+    command: [python, merge.py]
+    args:
+    - inputPath: InputFolder
+    - outputPath: OutputFolder
+    - inputValue: merge_count
diff --git a/9_smart_reply_pytorch.zip b/9_smart_reply_pytorch.zip
new file mode 100644
index 0000000..da29b66
Binary files /dev/null and b/9_smart_reply_pytorch.zip differ
diff --git a/9_smart_reply_pytorch/merge.py b/9_smart_reply_pytorch/merge.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/9_smart_reply_pytorch/merge.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/9_smart_reply_pytorch/module_spec.yaml b/9_smart_reply_pytorch/module_spec.yaml
new file mode 100644
index 0000000..b50c22c
--- /dev/null
+++ b/9_smart_reply_pytorch/module_spec.yaml
@@ -0,0 +1,234 @@
+moduleIdentifier:
+  namespace: microsoft.com/office
+  moduleName: "[AE365][SmartReply][AML][Test] PyTorch Distributed"
+  moduleVersion: 0.0.2-a0309172714
+type: Mpi
+metadata:
+  annotations:
+    familyId: 439f6c66-ab56-4798-9bab-ed3154c71c3d
+    tags:
+    - Office
+    - AE365
+    - SmartReply
+    - AML
+    - Test
+inputs:
+- name: train_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: valid_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: gmr_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: vocab_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: rsp_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: pretrained_model_path
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: model_input_dir
+  type:
+  - AzureEncryptedBlobReference
+  - AnyDirectory
+  optional: true
+- name: architecture
+  type: Enum
+  default: bert_matching_model
+  options:
+  - matching_model
+  - bert_matching_model
+- name: pretrained_model_file
+  type: Enum
+  default: bert_encoder_epoch_500.pt
+  options:
+  - bert_encoder_epoch_500.pt
+  optional: true
+- name: run_mode
+  type: Enum
+  default: train
+  options:
+  - train
+  - eval
+  - export
+- name: load_from
+  type: Enum
+  default: tnlr
+  options:
+  - bert
+  - tnlr
+  optional: true
+- name: max_epochs
+  type: Integer
+  default: 5
+  optional: true
+- name: lm_alpha
+  type: Float
+  default: 1.4
+  optional: true
+- name: batch_size
+  type: Integer
+  default: 256
+  optional: true
+- name: batch_size_infer
+  type: Integer
+  default: 256
+  optional: true
+- name: learning_rate
+  type: Float
+  default: 3e-4
+  optional: true
+- name: max_msg_len
+  type: Integer
+  default: 64
+  optional: true
+- name: max_rsp_len
+  type: Integer
+  default: 64
+  optional: true
+- name: decay_step
+  type: Integer
+  default: 2000
+  optional: true
+- name: decay_rate
+  type: Float
+  default: 0.99
+  optional: true
+- name: warmup_proportion
+  type: Float
+  default: 0.0002
+  optional: true
+- name: loss_scale
+  type: Integer
+  default: 0
+  optional: true
+- name: tokenizer
+  type: Enum
+  default: wordpiece
+  options:
+  - wordpiece
+  - sentencepiece
+  optional: true
+- name: gradient_accumulation_steps
+  type: Integer
+  default: 1
+  optional: true
+- name: optimizer
+  type: Enum
+  default: adam
+  options:
+  - adam
+  - adadelta
+  optional: true
+- name: validation_freq
+  type: Integer
+  default: -1
+  optional: true
+- name: save_freq
+  type: Integer
+  default: -1
+  optional: true
+- name: manual_seed
+  type: Integer
+  default: 42
+  optional: true
+- name: infer_batches
+  type: Integer
+  default: 1000
+  optional: true
+outputs:
+- name: model_output_dir
+  type: AzureEncryptedBlobReference
+- name: eval_output_dir
+  type: AzureEncryptedBlobReference
+runConfig:
+  nodeCount:
+    type: Integer
+    default: 4
+    min: 1
+  processCountPerNode:
+    type: Integer
+    default: 2
+implementation:
+  container:
+    image: mcr.microsoft.com/azureml/bert:pretrain-openmpi3.1.2-cuda10.0-cudnn7-ubuntu16.04
+    command: [/opt/miniconda/envs/amlbert/bin/python, smartreply/models/matching/driver.py]
+    args:
+    - --train_input_dir
+    - inputPath: train_input_dir
+    - --valid_input_dir
+    - inputPath: valid_input_dir
+    - --gmr_input_dir
+    - inputPath: gmr_input_dir
+    - --vocab_input_dir
+    - inputPath: vocab_input_dir
+    - --rsp_input_dir
+    - inputPath: rsp_input_dir
+    - --pretrained_model_path
+    - inputPath: pretrained_model_path
+    - --model_input_dir
+    - inputPath: model_input_dir
+    - --model_output_dir
+    - outputPath: model_output_dir
+    - --eval_output_dir
+    - outputPath: eval_output_dir
+    - --architecture
+    - inputValue: architecture
+    - --pretrained_model_file
+    - inputValue: pretrained_model_file
+    - --run_mode
+    - inputValue: run_mode
+    - --load_from
+    - inputValue: load_from
+    - --max_epochs
+    - inputValue: max_epochs
+    - --lm_alpha
+    - inputValue: lm_alpha
+    - --batch_size
+    - inputValue: batch_size
+    - --batch_size_infer
+    - inputValue: batch_size_infer
+    - --learning_rate
+    - inputValue: learning_rate
+    - --max_msg_len
+    - inputValue: max_msg_len
+    - --max_rsp_len
+    - inputValue: max_rsp_len
+    - --decay_step
+    - inputValue: decay_step
+    - --decay_rate
+    - inputValue: decay_rate
+    - --warmup_proportion
+    - inputValue: warmup_proportion
+    - --loss_scale
+    - inputValue: loss_scale
+    - --tokenizer
+    - inputValue: tokenizer
+    - --gradient_accumulation_steps
+    - inputValue: gradient_accumulation_steps
+    - --optimizer
+    - inputValue: optimizer
+    - --validation_freq
+    - inputValue: validation_freq
+    - --save_freq
+    - inputValue: save_freq
+    - --manual_seed
+    - inputValue: manual_seed
+    - --infer_batches
+    - inputValue: infer_batches
diff --git a/9_smart_reply_pytorch/smartreply/models/matching/driver.py b/9_smart_reply_pytorch/smartreply/models/matching/driver.py
new file mode 100644
index 0000000..eaf3db2
--- /dev/null
+++ b/9_smart_reply_pytorch/smartreply/models/matching/driver.py
@@ -0,0 +1,6 @@
+import sys
+
+
+if __name__ == '__main__':
+    print(sys.argv)
+
diff --git a/build_zip.py b/build_zip.py
new file mode 100644
index 0000000..4f6b57a
--- /dev/null
+++ b/build_zip.py
@@ -0,0 +1,52 @@
+import os
+
+from pathlib import Path
+from zipfile import ZipFile, ZIP_DEFLATED
+
+
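+# create_zip stores entries under the path passed to zipf.write, so the folder
+# prefix is kept; make_zipfile below archives paths relative to the folder.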
+def create_zip(path):
+    with ZipFile(f'{path}.zip', 'w', ZIP_DEFLATED) as zipf:
+        for root, dirs, files in os.walk(path):
+            for f in files:
+                print(f'Zipping {f}...')
+                zipf.write(os.path.join(root, f))
+
+
+def make_zipfile(zip_file_path, folder_or_file_to_zip, exclude_function=None):
+    """Create an archive, excluding selected files or directories. Adapted from shutil._make_zipfile.
+
+    :param zip_file_path: Path of zip file to create.
+    :param folder_or_file_to_zip: Directory or file that will be zipped.
+    :param exclude_function: Function to exclude files or directories.
+    """
+    with ZipFile(zip_file_path, "w") as zf:
+        if os.path.isfile(folder_or_file_to_zip):
+            zf.write(folder_or_file_to_zip, os.path.basename(folder_or_file_to_zip))
+        else:
+            for dirpath, dirnames, filenames in os.walk(folder_or_file_to_zip):
+                relative_dirpath = os.path.relpath(dirpath, folder_or_file_to_zip)
+                for name in sorted(dirnames):
+                    full_path = os.path.normpath(os.path.join(dirpath, name))
+                    relative_path = os.path.normpath(os.path.join(relative_dirpath, name))
+                    if exclude_function and exclude_function(full_path):
+                        continue
+                    zf.write(full_path, relative_path)
+                for name in filenames:
+                    full_path = os.path.normpath(os.path.join(dirpath, name))
+                    relative_path = os.path.normpath(os.path.join(relative_dirpath, name))
+                    if exclude_function and exclude_function(full_path):
+                        continue
+                    if os.path.isfile(full_path):
+                        zf.write(full_path, relative_path)
+
+
+if __name__ == '__main__':
+    for p in Path('.').iterdir():
+        if p.is_dir():
+            print(f'Processing {p} ...')
+            make_zipfile(
+                zip_file_path=str(p) + '.zip',
+                folder_or_file_to_zip=p,
+            )
diff --git a/multiple_specs.zip b/multiple_specs.zip
new file mode 100644
index 0000000..7bc5d01
Binary files /dev/null and b/multiple_specs.zip differ
diff --git a/multiple_specs_subfolder.zip b/multiple_specs_subfolder.zip
new file mode 100644
index 0000000..206b264
Binary files /dev/null and b/multiple_specs_subfolder.zip differ
diff --git a/multiple_specs_subfolder/add_rows.yaml b/multiple_specs_subfolder/add_rows.yaml
index 762d20a..062c691 100644
--- a/multiple_specs_subfolder/add_rows.yaml
+++ b/multiple_specs_subfolder/add_rows.yaml
@@ -1,6 +1,6 @@
 name: Add Rows
 id: b2ebdabd-217d-4915-86cc-5b05972f7270
-version: 0.0.114
+version: 0.0.114.1
 isDeterministic: true
 category: Data Transformation
 description: Appends a set of rows from an input dataset to the end of another dataset.
diff --git a/no_spec.zip b/no_spec.zip
new file mode 100644
index 0000000..3412c07
Binary files /dev/null and b/no_spec.zip differ
diff --git a/one_spec.zip b/one_spec.zip
new file mode 100644
index 0000000..1734e91
Binary files /dev/null and b/one_spec.zip differ
diff --git a/one_spec/clean_missing_data.yaml b/one_spec/clean_missing_data.yaml
index a3c4eb2..7b4a0de 100644
--- a/one_spec/clean_missing_data.yaml
+++ b/one_spec/clean_missing_data.yaml
@@ -1,6 +1,6 @@
-name: Clean Missing Data
-id: d2c5ca2f-7323-41a3-9b7e-da917c99f0c4
-version: 0.0.115.1
+name: My Awesome Module
+id: d2c5ca2f-7323-41a3-900e-da917c99f0c4
+version: 0.0.6.6
 isDeterministic: true
 category: Data Transformation
 description: Specifies how to handle the values missing from a dataset.