From 919936bed1cc5eeb07759edfdb0d0d137b9cb13c Mon Sep 17 00:00:00 2001
From: Waterkin <1055905911@qq.com>
Date: Wed, 17 Jan 2024 10:37:54 +0800
Subject: [PATCH 1/2] Fix a bug when building the website.

Signed-off-by: Waterkin <1055905911@qq.com>
---
 .../docs/operators/feature/univariatefeatureselector.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/content/docs/operators/feature/univariatefeatureselector.md b/docs/content/docs/operators/feature/univariatefeatureselector.md
index 873919e21..c12288c2e 100644
--- a/docs/content/docs/operators/feature/univariatefeatureselector.md
+++ b/docs/content/docs/operators/feature/univariatefeatureselector.md
@@ -220,3 +220,6 @@ for result in t_env.to_data_stream(output).execute_and_collect():
           '\tOutput Value: ' + str(result[output_index]))
 ```
+{{< /tab >}}
+
+{{< /tabs >}}
\ No newline at end of file

From 10007c1a8d995765073c8d7d2c820d46fa9bd20a Mon Sep 17 00:00:00 2001
From: Waterkin <1055905911@qq.com>
Date: Wed, 17 Jan 2024 10:39:26 +0800
Subject: [PATCH 2/2] Add a Chinese version of the documentation.

Signed-off-by: Waterkin <1055905911@qq.com>
---
 docs/config.toml                              |  11 +
 docs/content.zh/_index.md                     |  51 +++
 docs/content.zh/docs/development/_index.md    |  26 ++
 .../docs/development/build-and-install.md     | 111 ++++++
 docs/content.zh/docs/development/iteration.md | 225 +++++++++++
 docs/content.zh/docs/development/overview.md  | 246 ++++++++++++
 docs/content.zh/docs/development/types.md     |  74 ++++
 docs/content.zh/docs/operators/_index.md      |  25 ++
 .../docs/operators/classification/_index.md   |  25 ++
 .../docs/operators/classification/knn.md      | 216 +++++++++++
 .../operators/classification/linearsvc.md     | 197 ++++++++++
 .../classification/logisticregression.md      | 364 +++++++++++++++++
 .../operators/classification/naivebayes.md    | 192 +++++++++
 .../docs/operators/clustering/_index.md       |  25 ++
 .../clustering/agglomerativeclustering.md     | 181 +++++++++
 .../docs/operators/clustering/kmeans.md       | 329 ++++++++++++++++
 .../docs/operators/evaluation/_index.md       |  25 ++
 .../binaryclassificationevaluator.md          | 192 +++++++++
 .../docs/operators/feature/_index.md          |  25 ++
 .../docs/operators/feature/binarizer.md       | 183 +++++++++
 .../docs/operators/feature/bucketizer.md      | 179 +++++++++
 .../docs/operators/feature/countvectorizer.md | 182 +++++++++
 docs/content.zh/docs/operators/feature/dct.md | 151 ++++++++
 .../operators/feature/elementwiseproduct.md   | 157 ++++++++
 .../docs/operators/feature/featurehasher.md   | 177 +++++++++
 .../docs/operators/feature/hashingtf.md       | 165 ++++++++
 docs/content.zh/docs/operators/feature/idf.md | 172 ++++++++
 .../docs/operators/feature/imputer.md         | 196 ++++++++++
 .../docs/operators/feature/indextostring.md   | 184 +++++++++
 .../docs/operators/feature/interaction.md     | 169 ++++++++
 .../operators/feature/kbinsdiscretizer.md     | 185 +++++++++
 .../docs/operators/feature/maxabsscaler.md    | 180 +++++++++
 .../docs/operators/feature/minhashlsh.md      | 288 ++++++++++++++
 .../docs/operators/feature/minmaxscaler.md    | 180 +++++++++
 .../docs/operators/feature/ngram.md           | 155 ++++++++
 .../docs/operators/feature/normalizer.md      | 154 ++++++++
 .../docs/operators/feature/onehotencoder.md   | 161 ++++++++
 .../operators/feature/onlinestandardscaler.md | 259 +++++++++++++
 .../operators/feature/polynomialexpansion.md  | 160 ++++++++
 .../docs/operators/feature/randomsplitter.md  | 148 +++++++
 .../docs/operators/feature/regextokenizer.md  | 156 ++++++++
 .../docs/operators/feature/robustscaler.md    | 211 ++++++++++
 .../docs/operators/feature/sqltransformer.md  | 142 +++++++
.../docs/operators/feature/standardscaler.md | 158 ++++++++ .../operators/feature/stopwordsremover.md | 165 ++++++++ .../docs/operators/feature/stringindexer.md | 219 +++++++++++ .../docs/operators/feature/tokenizer.md | 148 +++++++ .../feature/univariatefeatureselector.md | 225 +++++++++++ .../feature/variancethresholdselector.md | 189 +++++++++ .../docs/operators/feature/vectorassembler.md | 193 +++++++++ .../docs/operators/feature/vectorindexer.md | 210 ++++++++++ .../docs/operators/feature/vectorslicer.md | 158 ++++++++ docs/content.zh/docs/operators/functions.md | 236 +++++++++++ .../docs/operators/recommendation/_index.md | 25 ++ .../docs/operators/recommendation/swing.md | 194 ++++++++++ .../docs/operators/regression/_index.md | 25 ++ .../operators/regression/linearregression.md | 188 +++++++++ .../content.zh/docs/operators/stats/_index.md | 25 ++ .../docs/operators/stats/chisqtest.md | 187 +++++++++ docs/content.zh/docs/try-flink-ml/_index.md | 25 ++ .../docs/try-flink-ml/java/_index.md | 23 ++ .../java/build-your-own-project.md | 366 ++++++++++++++++++ .../docs/try-flink-ml/java/quick-start.md | 139 +++++++ .../docs/try-flink-ml/python/_index.md | 23 ++ .../docs/try-flink-ml/python/quick-start.md | 338 ++++++++++++++++ docs/content.zh/versions.md | 29 ++ 66 files changed, 10322 insertions(+) create mode 100644 docs/content.zh/_index.md create mode 100644 docs/content.zh/docs/development/_index.md create mode 100644 docs/content.zh/docs/development/build-and-install.md create mode 100644 docs/content.zh/docs/development/iteration.md create mode 100644 docs/content.zh/docs/development/overview.md create mode 100644 docs/content.zh/docs/development/types.md create mode 100644 docs/content.zh/docs/operators/_index.md create mode 100644 docs/content.zh/docs/operators/classification/_index.md create mode 100644 docs/content.zh/docs/operators/classification/knn.md create mode 100644 docs/content.zh/docs/operators/classification/linearsvc.md create mode 100644 docs/content.zh/docs/operators/classification/logisticregression.md create mode 100644 docs/content.zh/docs/operators/classification/naivebayes.md create mode 100644 docs/content.zh/docs/operators/clustering/_index.md create mode 100644 docs/content.zh/docs/operators/clustering/agglomerativeclustering.md create mode 100644 docs/content.zh/docs/operators/clustering/kmeans.md create mode 100644 docs/content.zh/docs/operators/evaluation/_index.md create mode 100644 docs/content.zh/docs/operators/evaluation/binaryclassificationevaluator.md create mode 100644 docs/content.zh/docs/operators/feature/_index.md create mode 100644 docs/content.zh/docs/operators/feature/binarizer.md create mode 100644 docs/content.zh/docs/operators/feature/bucketizer.md create mode 100644 docs/content.zh/docs/operators/feature/countvectorizer.md create mode 100644 docs/content.zh/docs/operators/feature/dct.md create mode 100644 docs/content.zh/docs/operators/feature/elementwiseproduct.md create mode 100644 docs/content.zh/docs/operators/feature/featurehasher.md create mode 100644 docs/content.zh/docs/operators/feature/hashingtf.md create mode 100644 docs/content.zh/docs/operators/feature/idf.md create mode 100644 docs/content.zh/docs/operators/feature/imputer.md create mode 100644 docs/content.zh/docs/operators/feature/indextostring.md create mode 100644 docs/content.zh/docs/operators/feature/interaction.md create mode 100644 docs/content.zh/docs/operators/feature/kbinsdiscretizer.md create mode 100644 docs/content.zh/docs/operators/feature/maxabsscaler.md 
create mode 100644 docs/content.zh/docs/operators/feature/minhashlsh.md create mode 100644 docs/content.zh/docs/operators/feature/minmaxscaler.md create mode 100644 docs/content.zh/docs/operators/feature/ngram.md create mode 100644 docs/content.zh/docs/operators/feature/normalizer.md create mode 100644 docs/content.zh/docs/operators/feature/onehotencoder.md create mode 100644 docs/content.zh/docs/operators/feature/onlinestandardscaler.md create mode 100644 docs/content.zh/docs/operators/feature/polynomialexpansion.md create mode 100644 docs/content.zh/docs/operators/feature/randomsplitter.md create mode 100644 docs/content.zh/docs/operators/feature/regextokenizer.md create mode 100644 docs/content.zh/docs/operators/feature/robustscaler.md create mode 100644 docs/content.zh/docs/operators/feature/sqltransformer.md create mode 100644 docs/content.zh/docs/operators/feature/standardscaler.md create mode 100644 docs/content.zh/docs/operators/feature/stopwordsremover.md create mode 100644 docs/content.zh/docs/operators/feature/stringindexer.md create mode 100644 docs/content.zh/docs/operators/feature/tokenizer.md create mode 100644 docs/content.zh/docs/operators/feature/univariatefeatureselector.md create mode 100644 docs/content.zh/docs/operators/feature/variancethresholdselector.md create mode 100644 docs/content.zh/docs/operators/feature/vectorassembler.md create mode 100644 docs/content.zh/docs/operators/feature/vectorindexer.md create mode 100644 docs/content.zh/docs/operators/feature/vectorslicer.md create mode 100644 docs/content.zh/docs/operators/functions.md create mode 100644 docs/content.zh/docs/operators/recommendation/_index.md create mode 100644 docs/content.zh/docs/operators/recommendation/swing.md create mode 100644 docs/content.zh/docs/operators/regression/_index.md create mode 100644 docs/content.zh/docs/operators/regression/linearregression.md create mode 100644 docs/content.zh/docs/operators/stats/_index.md create mode 100644 docs/content.zh/docs/operators/stats/chisqtest.md create mode 100644 docs/content.zh/docs/try-flink-ml/_index.md create mode 100644 docs/content.zh/docs/try-flink-ml/java/_index.md create mode 100644 docs/content.zh/docs/try-flink-ml/java/build-your-own-project.md create mode 100644 docs/content.zh/docs/try-flink-ml/java/quick-start.md create mode 100644 docs/content.zh/docs/try-flink-ml/python/_index.md create mode 100644 docs/content.zh/docs/try-flink-ml/python/quick-start.md create mode 100644 docs/content.zh/versions.md diff --git a/docs/config.toml b/docs/config.toml index 4fc5cff5f..667639f0a 100644 --- a/docs/config.toml +++ b/docs/config.toml @@ -71,3 +71,14 @@ pygmentsUseClasses = true [markup] [markup.goldmark.renderer] unsafe = true + +[languages] +[languages.en] + languageName = 'English' + contentDir = 'content' + weight = 1 + +[languages.zh] + languageName = '中文版' + contentDir = 'content.zh' + weight = 2 \ No newline at end of file diff --git a/docs/content.zh/_index.md b/docs/content.zh/_index.md new file mode 100644 index 000000000..b64b14b4d --- /dev/null +++ b/docs/content.zh/_index.md @@ -0,0 +1,51 @@ +--- +title: Apache Flink Machine Learning Library +type: docs +bookToc: false +--- + + +# Flink ML: Apache Flink Machine Learning Library + +Flink ML is a library which provides machine learning (ML) APIs and +infrastructures that simplify the building of ML pipelines. Users can implement +ML algorithms with the standard ML APIs and further use these infrastructures to +build ML pipelines for both training and inference jobs. 
+ +{{< columns >}} +## Try Flink ML + +If you’re interested in playing around with Flink ML, check out our [quick +start]({{< ref "docs/try-flink-ml/java/quick-start" >}}). It provides a simple +example to submit and execute a Flink ML job on a Flink cluster. + +<---> + +## Get Help with Flink ML + +If you get stuck, check out our [community support +resources](https://flink.apache.org/community.html). In particular, Apache +Flink’s user mailing list is consistently ranked as one of the most active of +any Apache project, and is a great way to get help quickly. + +{{< /columns >}} + +Flink ML is developed under the umbrella of [Apache +Flink](https://flink.apache.org/). diff --git a/docs/content.zh/docs/development/_index.md b/docs/content.zh/docs/development/_index.md new file mode 100644 index 000000000..322929622 --- /dev/null +++ b/docs/content.zh/docs/development/_index.md @@ -0,0 +1,26 @@ +--- +title: Development +icon: +bold: true +sectionBreak: true +bookCollapseSection: true +weight: 2 +--- + diff --git a/docs/content.zh/docs/development/build-and-install.md b/docs/content.zh/docs/development/build-and-install.md new file mode 100644 index 000000000..6d4ae0646 --- /dev/null +++ b/docs/content.zh/docs/development/build-and-install.md @@ -0,0 +1,111 @@ +--- +title: "Building And Installing Flink ML From Source" +weight: 999 +type: docs +aliases: +- /development/build-and-install.html + +--- + + + +# Building And Installing Flink ML From Source + +This page covers how to build and install Flink ML from sources. + +## Build and Install Java SDK + +In order to build Flink ML you need the source code. Either [download the source +of a release](https://flink.apache.org/downloads.html) or [clone the git +repository](https://github.com/apache/flink-ml.git). + +In addition, you need **Maven 3** and a **JDK** (Java Development Kit). Flink ML +requires **at least Java 8** to build. + +To clone from git, enter: + +```bash +git clone https://github.com/apache/flink-ml.git +``` + +The simplest way of building Flink ML is by running: + +```bash +mvn clean install -DskipTests +``` + +This instructs [Maven](http://maven.apache.org/) (`mvn`) to first remove all +existing builds (`clean`) and then create a new Flink ML binary (`install`). + +Optionally, you can also specify the Flink version used by Flink ML with maven +profile. Currently, Flink ML supports running on Flink 1.15, 1.16, and 1.17. +For example, you can build Flink ML with Flink 1.16 by running: +```bash +mvn clean install -DskipTests -Pflink-1.16 +``` + +After the build finishes, you can acquire the build result in the following path +from the root directory of Flink ML: + +``` +./flink-ml-dist/target/flink-ml-*-bin/flink-ml*/ +``` + +The `mvn clean install` command would have installed the binary into your local +Maven repository so other projects can refer to it and grab it from the +repository. There is no additional step required for installation. + +## Build and Install Python SDK + +### Prerequisites + +1. Building Flink ML Java SDK + + If you want to build Flink ML's Python SDK that can be used for pip + installation, you must first build the Java SDK, as described in the section + above. + +2. Python version(3.7, or 3.8) is required + ```shell + $ python --version + # the version printed here must be 3.7 or 3.8 + ``` + +3. 
Install the dependencies with the following command:
+   ```shell
+   $ python -m pip install -r flink-ml-python/dev/dev-requirements.txt
+   ```
+
+### Installation
+
+Then go to the root directory of the Flink ML source code and run this command to
+build the sdist package of `apache-flink-ml`:
+
+```shell
+cd flink-ml-python; python setup.py sdist; cd ..;
+```
+
+The sdist package of `apache-flink-ml` will be found under
+`./flink-ml-python/dist/`. It can be installed as follows:
+
+```shell
+python -m pip install flink-ml-python/dist/*.tar.gz
+```
+
diff --git a/docs/content.zh/docs/development/iteration.md b/docs/content.zh/docs/development/iteration.md
new file mode 100644
index 000000000..6e656beec
--- /dev/null
+++ b/docs/content.zh/docs/development/iteration.md
@@ -0,0 +1,225 @@
+---
+title: "Iteration"
+weight: 2
+type: docs
+aliases:
+- /development/iteration.html
+---
+
+# Iteration
+
+Iteration is a basic building block for an ML library. In machine learning
+algorithms, iteration might be used in an offline or online training process. In
+general, two types of iterations are required, and Flink ML supports both of them
+in order to provide the infrastructure for a variety of algorithms.
+
+1. **Bounded Iteration**: Usually used in the offline case. In this case the
+   algorithm usually trains on a bounded dataset and updates the parameters for
+   multiple rounds until convergence.
+2. **Unbounded Iteration**: Usually used in the online case, where the algorithm
+   usually trains on an unbounded dataset. It accumulates a mini-batch of data
+   and then performs one update to the parameters.
+
+## Iteration Paradigm
+
+An iterative algorithm has the following behavior pattern:
+
+- The iterative algorithm has an ***iteration body*** that is repeatedly invoked
+  until some termination criterion is reached (e.g. after a user-specified number
+  of epochs has been reached). An iteration body is a subgraph of operators that
+  implements the computation logic of e.g. an iterative machine learning
+  algorithm, whose outputs might be fed back as the inputs of this subgraph.
+- In each invocation, the iteration body updates the model parameters based on
+  the user-provided data as well as the most recent model parameters.
+- The iterative algorithm takes as inputs the user-provided data and the initial
+  model parameters.
+- The iterative algorithm could output arbitrary user-defined information, such
+  as the loss after each epoch, or the final model parameters.
+
+Therefore, the behavior of an iterative algorithm could be characterized by the
+following iteration paradigm (w.r.t. Flink concepts), as illustrated by the code
+sketch after this list:
+
+- An iteration-body is a Flink subgraph with the following inputs and outputs:
+  - Inputs: **model-variables** (as a list of data streams) and
+    **user-provided-data** (as another list of data streams)
+  - Outputs: **feedback-model-variables** (as a list of data streams) and
+    **user-observed-outputs** (as a list of data streams)
+- A **termination-condition** that specifies when the iterative execution of the
+  iteration body should terminate.
+- In order to execute an iteration body, a user needs to execute the iteration
+  body with the following inputs, and gets the following outputs.
+  - Inputs: **initial-model-variables** (as a list of bounded data streams) and
+    **user-provided-data** (as a list of data streams)
+  - Outputs: the **user-observed-output** emitted by the iteration body.
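+
+To make this mapping concrete, the sketch below wires the paradigm onto Flink
+ML's `Iterations` API, which is introduced in the API section that follows. It
+is a minimal illustration rather than a complete program: `updateModel` is a
+hypothetical helper standing in for whatever subgraph computes the feedback
+variables.
+
+```java
+// Minimal sketch of the paradigm above. The iteration body receives the model
+// variables and the user-provided data, and returns the feedback model
+// variables together with the user-observed outputs.
+DataStream<double[]> initialModelVariables = ...;
+DataStream<double[]> userProvidedData = ...;
+
+DataStreamList outputs =
+        Iterations.iterateUnboundedStreams(
+                DataStreamList.of(initialModelVariables), // initial-model-variables
+                DataStreamList.of(userProvidedData),      // user-provided-data
+                (variables, data) -> {
+                    // feedback-model-variables: unioned with the initial model
+                    // variables to form the next round's model-variables.
+                    DataStream<double[]> feedback =
+                            updateModel(variables.get(0), data.get(0)); // hypothetical helper
+                    // user-observed-outputs: here simply the updated model.
+                    return new IterationBodyResult(
+                            DataStreamList.of(feedback), DataStreamList.of(feedback));
+                });
+```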
+
+It is important to note that the **model-variables** expected by the iteration
+body are not the same as the **initial-model-variables** provided by the user.
+Instead, **model-variables** are computed as the union of the
+**feedback-model-variables** (emitted by the iteration body) and the
+**initial-model-variables** (provided by the caller of the iteration body).
+Flink ML provides a utility class (see `Iterations`) to run an iteration body
+with the user-provided inputs.
+
+The figure below summarizes the iteration paradigm described above.
+
+{{< mermaid >}}
+flowchart LR
+
+subgraph Iteration Body
+union1
+union2
+node11
+node12
+node21
+node22
+nodeX
+end
+
+input0 --> node11
+
+union1 -. feedback .- node12
+input1 --> union1
+union1 --> node11
+node11 --> nodeX
+nodeX --> node12
+node12 --> output1
+
+input2 --> union2
+union2 --> node21
+node21 --> nodeX
+nodeX --> node22
+node22 --> output2
+union2 -. feedback .- node22
+
+input0[non-iterate input]
+input1[iterate input]
+input2[iterate input]
+union1[union]
+union2[union]
+node11( )
+node12( )
+nodeX( )
+node21( )
+node22( )
+output1[output]
+output2[output]
+
+{{< /mermaid >}}
+
+## API
+
+The main entry of Flink ML's iteration lies in the `Iterations` class. It mainly
+provides two public methods, and users may choose to use either of them based on
+whether the input data is bounded or unbounded.
+
+```java
+public class Iterations {
+    public static DataStreamList iterateUnboundedStreams(
+        DataStreamList initVariableStreams, DataStreamList dataStreams, IterationBody body) {...}
+    ...
+    public static DataStreamList iterateBoundedStreamsUntilTermination(
+        DataStreamList initVariableStreams,
+        ReplayableDataStreamList dataStreams,
+        IterationConfig config,
+        IterationBody body){...}
+}
+```
+
+To construct an iteration, users are required to provide
+
+- `initVariableStreams`: the initial values of the variable data streams, which
+  would be updated in each round.
+- `dataStreams`: the other data streams used inside the iteration, which would
+  not be updated.
+- `iterationBody`: specifies the subgraph to update the variable streams and the
+  outputs.
+
+The `IterationBody` will be invoked with two parameters: the first parameter is
+a list of input variable streams, which are created as the union of the initial
+variable streams and the corresponding feedback variable streams (returned by
+the iteration body); the second parameter is the data streams given to this
+method.
+
+```java
+public interface IterationBody extends Serializable {
+    ...
+    IterationBodyResult process(DataStreamList variableStreams, DataStreamList dataStreams);
+    ...
+}
+```
+
+During the execution of the iteration body, each of the records involved in the
+iteration has an epoch attached, which marks the progress of the iteration. The
+epoch is computed as follows:
+
+- All records in the initial variable streams and initial data streams have
+  epoch = 0.
+- For any record emitted by an operator into a non-feedback stream, the epoch
+  of this emitted record = the epoch of the input record that triggers this
+  emission. If this record is emitted by `onEpochWatermarkIncremented()`, then
+  the epoch of this record = epochWatermark.
+- For any record emitted by an operator into a feedback variable stream, the
+  epoch of the emitted record = the epoch of the input record that triggers this
+  emission + 1.
+
+The framework delivers notifications at the end of each epoch to operators
+and UDFs that implement `IterationListener`.
+
+```java
+public interface IterationListener<T> {
+    void onEpochWatermarkIncremented(int epochWatermark, Context context, Collector<T> collector)
+        throws Exception;
+    ...
+    void onIterationTerminated(Context context, Collector<T> collector) throws Exception;
+}
+```
+
+## Example Usage
+
+Example code utilizing iterations is shown below.
+
+```java
+DataStream<double[]> initParameters = ...
+DataStream<Tuple2<double[], Double>> dataset = ...
+
+DataStreamList resultStreams = Iterations.iterateBoundedStreamsUntilTermination(
+    DataStreamList.of(initParameters),
+    ReplayableDataStreamList.notReplay(dataset),
+    IterationConfig.newBuilder().setOperatorRoundMode(ALL_ROUND).build(),
+    (variableStreams, dataStreams) -> {
+        DataStream<double[]> modelUpdate = variableStreams.get(0);
+        DataStream<Tuple2<double[], Double>> dataset = dataStreams.get(0);
+        DataStream<double[]> newModelUpdate = ...
+        DataStream<double[]> modelOutput = ...
+        return new IterationBodyResult(
+            DataStreamList.of(newModelUpdate),
+            DataStreamList.of(modelOutput));
+});
+
+DataStream<double[]> finalModel = resultStreams.get(0);
+```
+
+- `initParameters`: input data that needs to be transmitted through the feedback
+  edge.
+- `dataset`: input data that does not need to be transmitted through the
+  feedback edge.
+- `newModelUpdate`: data to be transmitted through the feedback edge.
+- `modelOutput`: the final output of the iteration body.
diff --git a/docs/content.zh/docs/development/overview.md b/docs/content.zh/docs/development/overview.md
new file mode 100644
index 000000000..5be08ebe4
--- /dev/null
+++ b/docs/content.zh/docs/development/overview.md
@@ -0,0 +1,246 @@
+---
+title: "Overview"
+weight: 1
+type: docs
+aliases:
+- /development/overview.html
+---
+
+# Overview
+
+This document provides a brief introduction to the basic concepts in Flink ML.
+
+## Table API
+
+Flink ML's API is based on Flink's Table API. The Table API is a
+language-integrated query API for Java, Scala, and Python that allows the
+composition of queries from relational operators such as selection, filter, and
+join in a very intuitive way.
+
+The Table API allows the usage of a wide range of data types. The [Flink Data
+Types](https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/dev/table/types/)
+page provides a list of supported types. In addition to these types, Flink ML
+also provides support for the `Vector` type.
+
+The Table API integrates seamlessly with Flink's DataStream API. You can easily
+switch between all APIs and libraries which build upon them. Please refer to
+Flink's documentation for how to convert between `Table` and `DataStream`, as
+well as other usages of the Flink Table API.
+
+## Stage
+
+A `Stage` is a node in a `Pipeline` or `Graph`. It is the fundamental component
+in Flink ML. This interface is only a concept, and does not have any actual
+functionality. Its subclasses include the following.
+
+- `Estimator`: An `Estimator` is a `Stage` that is responsible for the training
+  process in machine learning algorithms. It implements a `fit()` method that
+  takes a list of tables and produces a `Model`.
+
+- `AlgoOperator`: An `AlgoOperator` is a `Stage` that is used to encode generic
+  multi-input multi-output computation logic. It implements a `transform()`
+  method, which applies certain computation logic on the given input tables and
+  returns a list of result tables.
+
+- `Transformer`: A `Transformer` is an `AlgoOperator` with the semantic
+  difference that it encodes transformation logic, such that a record in the
+  output typically corresponds to one record in the input.
In contrast, an `AlgoOperator` is a better fit to express aggregation logic
+  where a record in the output could be computed from an arbitrary number of
+  records in the input.
+
+- `Model`: A `Model` is a `Transformer` with the extra APIs to set and get model
+  data. It is typically generated by fitting an `Estimator` on a list of tables.
+  It provides `getModelData()` and `setModelData()`, which allow users to
+  explicitly read or write model data tables to the transformer. Each table
+  could be an unbounded stream of model data changes.
+
+A typical usage of `Stage` is to create an `Estimator` instance first, trigger
+its training process by invoking its `fit()` method, and then perform
+predictions with the resulting `Model` instance. This example usage is shown in
+the code below.
+
+```java
+// Suppose SumModel is a concrete subclass of Model, SumEstimator is a concrete subclass of Estimator.
+
+Table trainData = ...;
+Table predictData = ...;
+
+SumEstimator estimator = new SumEstimator();
+SumModel model = estimator.fit(trainData);
+Table predictResult = model.transform(predictData)[0];
+```
+
+## Builders
+
+In order to organize Flink ML stages into more complex structures so as to
+achieve advanced functionality, such as chaining data processing and machine
+learning algorithms together, Flink ML provides APIs that help to manage the
+relationship and structure of stages in Flink jobs. The entry points of these
+APIs are `Pipeline` and `Graph`.
+
+### Pipeline
+
+A `Pipeline` acts as an `Estimator`. It consists of an ordered list of stages,
+each of which could be an `Estimator`, `Model`, `Transformer` or `AlgoOperator`.
+Its `fit()` method goes through all stages of this pipeline in order and does
+the following on each stage until the last `Estimator` (inclusive).
+
+- If a stage is an `Estimator`, it would invoke the stage's `fit()` method with
+  the input tables to generate a `Model`. If there is an `Estimator` after this
+  stage, it would transform the input tables using the generated `Model` to get
+  result tables, then pass the result tables to the next stage as inputs.
+- If a stage is an `AlgoOperator` and there is an `Estimator` after this stage,
+  it would transform the input tables using this stage to get result tables,
+  then pass the result tables to the next stage as inputs.
+
+After all the `Estimator`s are trained to fit their input tables, a new
+`PipelineModel` will be created with the same stages in this pipeline, except
+that all the `Estimator`s in the `PipelineModel` are replaced with the models
+generated in the above process.
+
+A `PipelineModel` acts as a `Model`. It consists of an ordered list of stages,
+each of which could be a `Model`, `Transformer` or `AlgoOperator`. Its
+`transform()` method applies all stages in this `PipelineModel` on the input
+tables in order. The output of one stage is used as the input of the next stage
+(if any). The output of the last stage is returned as the result of this method.
+
+A `Pipeline` can be created by passing a list of `Stage`s to the Pipeline's
+constructor. For example,
+
+```java
+// Suppose SumModel is a concrete subclass of Model, SumEstimator is a concrete subclass of Estimator.
+
+Model<?> modelA = new SumModel().setModelData(tEnv.fromValues(10));
+Estimator<?, ?> estimatorA = new SumEstimator();
+Model<?> modelB = new SumModel().setModelData(tEnv.fromValues(30));
+
+List<Stage<?>> stages = Arrays.asList(modelA, estimatorA, modelB);
+Estimator<?, ?> estimator = new Pipeline(stages);
+```
+
+The code above creates a `Pipeline` as follows.
{{< mermaid >}}
+
+graph LR
+
+empty0[ ] --> modelA --> estimatorA --> modelB --> empty1[ ]
+
+style empty0 fill:#FFFFFF, stroke:#FFFFFF;
+style empty1 fill:#FFFFFF, stroke:#FFFFFF;
+
+{{< /mermaid >}}
+
+### Graph
+
+A `Graph` acts as an `Estimator`. A `Graph` consists of a DAG of stages, each of
+which could be an `Estimator`, `Model`, `Transformer` or `AlgoOperator`. When
+`Graph::fit` is called, the stages are executed in a topologically-sorted order.
+If a stage is an `Estimator`, its `Estimator::fit` method will be called on the
+input tables (from the input edges) to fit a `Model`. Then the `Model` will be
+used to transform the input tables and produce output tables to the output
+edges. If a stage is an `AlgoOperator`, its `AlgoOperator::transform` method
+will be called on the input tables and produce output tables to the output
+edges. The `GraphModel` fitted from a `Graph` consists of the fitted `Model`s
+and `AlgoOperator`s, corresponding to the `Graph`'s stages.
+
+A `GraphModel` acts as a `Model`. A `GraphModel` consists of a DAG of stages,
+each of which could be an `Estimator`, `Model`, `Transformer` or `AlgoOperator`.
+When `GraphModel::transform` is called, the stages are executed in a
+topologically-sorted order. When a stage is executed, its
+`AlgoOperator::transform` method will be called on the input tables (from the
+input edges) and produce output tables to the output edges.
+
+A `Graph` can be constructed via the `GraphBuilder` class, which provides
+methods like `addAlgoOperator` or `addEstimator` to help add stages to a
+graph. Flink ML also introduces the `TableId` class to represent the
+input/output of a stage and to help express the relationship between stages in
+a graph, thus allowing users to construct the DAG before they have the concrete
+tables available.
+
+The example code below shows how to build a `Graph`.
+
+```java
+// Suppose SumModel is a concrete subclass of Model.
+
+GraphBuilder builder = new GraphBuilder();
+// Creates nodes.
+SumModel stage1 = new SumModel().setModelData(tEnv.fromValues(1));
+SumModel stage2 = new SumModel();
+SumModel stage3 = new SumModel().setModelData(tEnv.fromValues(3));
+// Creates inputs and modelDataInputs.
+TableId input = builder.createTableId();
+TableId modelDataInput = builder.createTableId();
+// Feeds inputs and gets outputs.
+TableId output1 = builder.addAlgoOperator(stage1, input)[0];
+TableId output2 = builder.addAlgoOperator(stage2, output1)[0];
+builder.setModelDataOnModel(stage2, modelDataInput);
+TableId output3 = builder.addAlgoOperator(stage3, output2)[0];
+TableId modelDataOutput = builder.getModelDataFromModel(stage3)[0];
+
+// Builds a Model from the graph.
+TableId[] inputs = new TableId[] {input};
+TableId[] outputs = new TableId[] {output3};
+TableId[] modelDataInputs = new TableId[] {modelDataInput};
+TableId[] modelDataOutputs = new TableId[] {modelDataOutput};
+Model<?> model = builder.buildModel(inputs, outputs, modelDataInputs, modelDataOutputs);
+```
+
+The code above constructs a `Graph` as follows.
{{< mermaid >}}
+
+graph LR
+
+empty0[ ] --> |input| stage1
+stage1 --> |output1| stage2
+empty1[ ] --> |modelDataInput| stage2
+stage2 --> |output2| stage3
+stage3 --> |output3| empty3[ ]
+stage3 --> |modelDataOutput| empty2[ ]
+
+style empty0 fill:#FFFFFF, stroke:#FFFFFF;
+style empty1 fill:#FFFFFF, stroke:#FFFFFF;
+style empty2 fill:#FFFFFF, stroke:#FFFFFF;
+style empty3 fill:#FFFFFF, stroke:#FFFFFF;
+
+{{< /mermaid >}}
+
+## Parameter
+
+A Flink ML `Stage` is a subclass of `WithParams`, which provides a uniform API
+to get and set parameters.
+
+A `Param` is the definition of a parameter, including its name, class,
+description, default value and validator.
+
+In order to set the parameters of an algorithm, users can use any of the
+following ways.
+
+- Invoke the parameter's specific set method. For example, in order to set `K`,
+  the number of clusters, of a K-means algorithm, users can directly invoke the
+  `setK()` method on that `KMeans` instance.
+- Pass a parameter map containing new values to the stage through the
+  `ParamUtils.updateExistingParams()` method.
+
+If a `Model` is generated through an `Estimator`'s `fit()` method, the `Model`
+would inherit the `Estimator` object's parameters. Thus there is no need to set
+the parameters a second time if the parameters are not changed.
diff --git a/docs/content.zh/docs/development/types.md b/docs/content.zh/docs/development/types.md
new file mode 100644
index 000000000..aba0388aa
--- /dev/null
+++ b/docs/content.zh/docs/development/types.md
@@ -0,0 +1,74 @@
+---
+title: "Data Types"
+weight: 3
+type: docs
+aliases:
+- /development/types.html
+---
+
+# Data Types
+
+Flink ML supports all data types that are supported by the Flink Table API, as
+well as the data types listed in the sections below.
+
+## Vector
+
+Flink ML provides support for vectors of double values. A `Vector` in Flink ML
+can be either dense (`DenseVector`) or sparse (`SparseVector`), depending on how
+users create them according to the vector's sparsity. Each vector is initialized
+with a fixed size and users may get or set the double value of any 0-based index
+location in the vector.
+
+Flink ML also has a class named `Vectors` providing utility methods for
+instantiating vectors.
+
+{{< tabs vector >}}

+{{< tab "Java">}}
+```java
+int n = 4;
+int[] indices = new int[] {0, 2, 3};
+double[] values = new double[] {0.1, 0.3, 0.4};
+
+SparseVector vector = Vectors.sparse(n, indices, values);
+```
+{{< /tab>}}
+
+{{< tab "Python">}}
+```python
+# Create a dense vector of 64-bit floats from a Python list or numbers.
+>>> Vectors.dense([1, 2, 3])
+DenseVector([1.0, 2.0, 3.0])
+>>> Vectors.dense(1.0, 2.0)
+DenseVector([1.0, 2.0])
+
+# Create a sparse vector, using either a dict, a list of (index, value) pairs, or two separate
+# arrays of indices and values.
+ +>>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) +SparseVector(4, {1: 1.0, 3: 5.5}) +>>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) +SparseVector(4, {1: 1.0, 3: 5.5}) +>>> Vectors.sparse(4, [1, 3], [1.0, 5.5]) +SparseVector(4, {1: 1.0, 3: 5.5}) +``` +{{< /tab>}} +{{< /tabs>}} \ No newline at end of file diff --git a/docs/content.zh/docs/operators/_index.md b/docs/content.zh/docs/operators/_index.md new file mode 100644 index 000000000..5038fcfa7 --- /dev/null +++ b/docs/content.zh/docs/operators/_index.md @@ -0,0 +1,25 @@ +--- +title: Operators +icon: +bold: true +bookCollapseSection: true +weight: 3 +--- + diff --git a/docs/content.zh/docs/operators/classification/_index.md b/docs/content.zh/docs/operators/classification/_index.md new file mode 100644 index 000000000..007d663f3 --- /dev/null +++ b/docs/content.zh/docs/operators/classification/_index.md @@ -0,0 +1,25 @@ +--- +title: Classification +bookCollapseSection: true +weight: 1 +aliases: + - /operators/feature/ +--- + diff --git a/docs/content.zh/docs/operators/classification/knn.md b/docs/content.zh/docs/operators/classification/knn.md new file mode 100644 index 000000000..0724f2daf --- /dev/null +++ b/docs/content.zh/docs/operators/classification/knn.md @@ -0,0 +1,216 @@ +--- +title: "KNN" +type: docs +aliases: +- /operators/classification/knn.html +--- + + +## KNN + +K Nearest Neighbor(KNN) is a classification algorithm. The basic assumption of +KNN is that if most of the nearest K neighbors of the provided sample belong to +the same label, then it is highly probable that the provided sample also belongs +to that label. + +### Input Columns + +| Param name | Type | Default | Description | +| :---------- | :------ | :----------- |:------------------| +| featuresCol | Vector | `"features"` | Feature vector. | +| labelCol | Integer | `"label"` | Label to predict. | + +### Output Columns + +| Param name | Type | Default | Description | +| :------------ | :------ | :------------- |:-----------------| +| predictionCol | Integer | `"prediction"` | Predicted label. | + +### Parameters + +Below are the parameters required by `KnnModel`. + +| Key | Default | Type | Required | Description | +|---------------| -------------- | ------- | -------- | -------------------------------- | +| k | `5` | Integer | no | The number of nearest neighbors. | +| featuresCol | `"features"` | String | no | Features column name. | +| predictionCol | `"prediction"` | String | no | Prediction column name. | + +`Knn` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +| -------- | --------- | ------ | -------- | ------------------ | +| labelCol | `"label"` | String | no | Label column name. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} +```java +import org.apache.flink.ml.classification.knn.Knn; +import org.apache.flink.ml.classification.knn.KnnModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a Knn model and uses it for classification. 
*/ +public class KnnExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. + DataStream trainStream = + env.fromElements( + Row.of(Vectors.dense(2.0, 3.0), 1.0), + Row.of(Vectors.dense(2.1, 3.1), 1.0), + Row.of(Vectors.dense(200.1, 300.1), 2.0), + Row.of(Vectors.dense(200.2, 300.2), 2.0), + Row.of(Vectors.dense(200.3, 300.3), 2.0), + Row.of(Vectors.dense(200.4, 300.4), 2.0), + Row.of(Vectors.dense(200.4, 300.4), 2.0), + Row.of(Vectors.dense(200.6, 300.6), 2.0), + Row.of(Vectors.dense(2.1, 3.1), 1.0), + Row.of(Vectors.dense(2.1, 3.1), 1.0), + Row.of(Vectors.dense(2.1, 3.1), 1.0), + Row.of(Vectors.dense(2.1, 3.1), 1.0), + Row.of(Vectors.dense(2.3, 3.2), 1.0), + Row.of(Vectors.dense(2.3, 3.2), 1.0), + Row.of(Vectors.dense(2.8, 3.2), 3.0), + Row.of(Vectors.dense(300., 3.2), 4.0), + Row.of(Vectors.dense(2.2, 3.2), 1.0), + Row.of(Vectors.dense(2.4, 3.2), 5.0), + Row.of(Vectors.dense(2.5, 3.2), 5.0), + Row.of(Vectors.dense(2.5, 3.2), 5.0), + Row.of(Vectors.dense(2.1, 3.1), 1.0)); + Table trainTable = tEnv.fromDataStream(trainStream).as("features", "label"); + + DataStream predictStream = + env.fromElements( + Row.of(Vectors.dense(4.0, 4.1), 5.0), Row.of(Vectors.dense(300, 42), 2.0)); + Table predictTable = tEnv.fromDataStream(predictStream).as("features", "label"); + + // Creates a Knn object and initializes its parameters. + Knn knn = new Knn().setK(4); + + // Trains the Knn Model. + KnnModel knnModel = knn.fit(trainTable); + + // Uses the Knn Model for predictions. + Table outputTable = knnModel.transform(predictTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector features = (DenseVector) row.getField(knn.getFeaturesCol()); + double expectedResult = (Double) row.getField(knn.getLabelCol()); + double predictionResult = (Double) row.getField(knn.getPredictionCol()); + System.out.printf( + "Features: %-15s \tExpected Result: %s \tPrediction Result: %s\n", + features, expectedResult, predictionResult); + } + } +} + +``` +{{< /tab>}} + +{{< tab "Python">}} +```python +# Simple program that trains a Knn model and uses it for classification. 
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.classification.knn import KNN +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input training and prediction data +train_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense([2.0, 3.0]), 1.0), + (Vectors.dense([2.1, 3.1]), 1.0), + (Vectors.dense([200.1, 300.1]), 2.0), + (Vectors.dense([200.2, 300.2]), 2.0), + (Vectors.dense([200.3, 300.3]), 2.0), + (Vectors.dense([200.4, 300.4]), 2.0), + (Vectors.dense([200.4, 300.4]), 2.0), + (Vectors.dense([200.6, 300.6]), 2.0), + (Vectors.dense([2.1, 3.1]), 1.0), + (Vectors.dense([2.1, 3.1]), 1.0), + (Vectors.dense([2.1, 3.1]), 1.0), + (Vectors.dense([2.1, 3.1]), 1.0), + (Vectors.dense([2.3, 3.2]), 1.0), + (Vectors.dense([2.3, 3.2]), 1.0), + (Vectors.dense([2.8, 3.2]), 3.0), + (Vectors.dense([300., 3.2]), 4.0), + (Vectors.dense([2.2, 3.2]), 1.0), + (Vectors.dense([2.4, 3.2]), 5.0), + (Vectors.dense([2.5, 3.2]), 5.0), + (Vectors.dense([2.5, 3.2]), 5.0), + (Vectors.dense([2.1, 3.1]), 1.0) + ], + type_info=Types.ROW_NAMED( + ['features', 'label'], + [DenseVectorTypeInfo(), Types.DOUBLE()]))) + +predict_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense([4.0, 4.1]), 5.0), + (Vectors.dense([300, 42]), 2.0), + ], + type_info=Types.ROW_NAMED( + ['features', 'label'], + [DenseVectorTypeInfo(), Types.DOUBLE()]))) + +# create a knn object and initialize its parameters +knn = KNN().set_k(4) + +# train the knn model +model = knn.fit(train_data) + +# use the knn model for predictions +output = model.transform(predict_data)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + features = result[field_names.index(knn.get_features_col())] + expected_result = result[field_names.index(knn.get_label_col())] + actual_result = result[field_names.index(knn.get_prediction_col())] + print('Features: ' + str(features) + ' \tExpected Result: ' + str(expected_result) + + ' \tActual Result: ' + str(actual_result)) +``` +{{< /tab>}} + +{{< /tabs>}} + diff --git a/docs/content.zh/docs/operators/classification/linearsvc.md b/docs/content.zh/docs/operators/classification/linearsvc.md new file mode 100644 index 000000000..5b134a995 --- /dev/null +++ b/docs/content.zh/docs/operators/classification/linearsvc.md @@ -0,0 +1,197 @@ +--- +title: "Linear SVC" +type: docs +aliases: +- /operators/classification/linearsvc.html +--- + + + +## Linear Support Vector Machine + +Linear Support Vector Machine (Linear SVC) is an algorithm that attempts to find +a hyperplane to maximize the distance between classified samples. + +### Input Columns + +| Param name | Type | Default | Description | +| :---------- | :------ | :----------- |:------------------| +| featuresCol | Vector | `"features"` | Feature vector. | +| labelCol | Integer | `"label"` | Label to predict. | +| weightCol | Double | `"weight"` | Weight of sample. | + +### Output Columns + +| Param name | Type | Default | Description | +| :--------------- | :------ | :---------------- |:-----------------------------------------| +| predictionCol | Integer | `"prediction"` | Label of the max probability. 
| +| rawPredictionCol | Vector | `"rawPrediction"` | Vector of the probability of each label. | + +### Parameters + +Below are the parameters required by `LinearSVCModel`. + +| Key | Default | Type | Required | Description | +|------------------|-------------------|--------|----------|-------------------------------------------------------------------------| +| featuresCol | `"features"` | String | no | Features column name. | +| predictionCol | `"prediction"` | String | no | Prediction column name. | +| rawPredictionCol | `"rawPrediction"` | String | no | Raw prediction column name. | +| threshold | `0.0` | Double | no | Threshold in binary classification prediction applied to rawPrediction. | + +`LinearSVC` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +| --------------- | --------- | ------- | -------- | ----------------------------------------------- | +| labelCol | `"label"` | String | no | Label column name. | +| weightCol | `null` | String | no | Weight column name. | +| maxIter | `20` | Integer | no | Maximum number of iterations. | +| reg | `0.` | Double | no | Regularization parameter. | +| elasticNet | `0.` | Double | no | ElasticNet parameter. | +| learningRate | `0.1` | Double | no | Learning rate of optimization method. | +| globalBatchSize | `32` | Integer | no | Global batch size of training algorithms. | +| tol | `1e-6` | Double | no | Convergence tolerance for iterative algorithms. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.classification.linearsvc.LinearSVC; +import org.apache.flink.ml.classification.linearsvc.LinearSVCModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a LinearSVC model and uses it for classification. */ +public class LinearSVCExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(Vectors.dense(1, 2, 3, 4), 0., 1.), + Row.of(Vectors.dense(2, 2, 3, 4), 0., 2.), + Row.of(Vectors.dense(3, 2, 3, 4), 0., 3.), + Row.of(Vectors.dense(4, 2, 3, 4), 0., 4.), + Row.of(Vectors.dense(5, 2, 3, 4), 0., 5.), + Row.of(Vectors.dense(11, 2, 3, 4), 1., 1.), + Row.of(Vectors.dense(12, 2, 3, 4), 1., 2.), + Row.of(Vectors.dense(13, 2, 3, 4), 1., 3.), + Row.of(Vectors.dense(14, 2, 3, 4), 1., 4.), + Row.of(Vectors.dense(15, 2, 3, 4), 1., 5.)); + Table inputTable = tEnv.fromDataStream(inputStream).as("features", "label", "weight"); + + // Creates a LinearSVC object and initializes its parameters. + LinearSVC linearSVC = new LinearSVC().setWeightCol("weight"); + + // Trains the LinearSVC Model. + LinearSVCModel linearSVCModel = linearSVC.fit(inputTable); + + // Uses the LinearSVC Model for predictions. + Table outputTable = linearSVCModel.transform(inputTable)[0]; + + // Extracts and displays the results. 
+ for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector features = (DenseVector) row.getField(linearSVC.getFeaturesCol()); + double expectedResult = (Double) row.getField(linearSVC.getLabelCol()); + double predictionResult = (Double) row.getField(linearSVC.getPredictionCol()); + DenseVector rawPredictionResult = + (DenseVector) row.getField(linearSVC.getRawPredictionCol()); + System.out.printf( + "Features: %-25s \tExpected Result: %s \tPrediction Result: %s \tRaw Prediction Result: %s\n", + features, expectedResult, predictionResult, rawPredictionResult); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that trains a LinearSVC model and uses it for classification. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.classification.linearsvc import LinearSVC +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense([1, 2, 3, 4]), 0., 1.), + (Vectors.dense([2, 2, 3, 4]), 0., 2.), + (Vectors.dense([3, 2, 3, 4]), 0., 3.), + (Vectors.dense([4, 2, 3, 4]), 0., 4.), + (Vectors.dense([5, 2, 3, 4]), 0., 5.), + (Vectors.dense([11, 2, 3, 4]), 1., 1.), + (Vectors.dense([12, 2, 3, 4]), 1., 2.), + (Vectors.dense([13, 2, 3, 4]), 1., 3.), + (Vectors.dense([14, 2, 3, 4]), 1., 4.), + (Vectors.dense([15, 2, 3, 4]), 1., 5.), + ], + type_info=Types.ROW_NAMED( + ['features', 'label', 'weight'], + [DenseVectorTypeInfo(), Types.DOUBLE(), Types.DOUBLE()]) + )) + +# create a linear svc object and initialize its parameters +linear_svc = LinearSVC().set_weight_col('weight') + +# train the linear svc model +model = linear_svc.fit(input_table) + +# use the linear svc model for predictions +output = model.transform(input_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + features = result[field_names.index(linear_svc.get_features_col())] + expected_result = result[field_names.index(linear_svc.get_label_col())] + prediction_result = result[field_names.index(linear_svc.get_prediction_col())] + raw_prediction_result = result[field_names.index(linear_svc.get_raw_prediction_col())] + print('Features: ' + str(features) + ' \tExpected Result: ' + str(expected_result) + + ' \tPrediction Result: ' + str(prediction_result) + + ' \tRaw Prediction Result: ' + str(raw_prediction_result)) +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/classification/logisticregression.md b/docs/content.zh/docs/operators/classification/logisticregression.md new file mode 100644 index 000000000..b45b8480a --- /dev/null +++ b/docs/content.zh/docs/operators/classification/logisticregression.md @@ -0,0 +1,364 @@ +--- +title: "Logistic Regression" +type: docs +aliases: +- /operators/classification/logisticregression.html +--- + + +## Logistic Regression + +Logistic regression is a special case of the Generalized Linear Model. It is +widely used to predict a binary response. 
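+
+Concretely, for a feature vector $x$ with model coefficients $w$ and intercept
+$b$, logistic regression estimates the probability of the positive class as
+
+$$
+P(y = 1 \mid x) = \frac{1}{1 + e^{-(w^{\top} x + b)}}
+$$
+
+and predicts the label with the larger probability. This is the standard
+textbook formulation, shown here for reference rather than as a description of
+Flink ML's internal implementation.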
+ +### Input Columns + +| Param name | Type | Default | Description | +| :---------- | :------ | :----------- |:------------------| +| featuresCol | Vector | `"features"` | Feature vector. | +| labelCol | Integer | `"label"` | Label to predict. | +| weightCol | Double | `"weight"` | Weight of sample. | + +### Output Columns + +| Param name | Type | Default | Description | +| :--------------- | :------ | :---------------- |:-----------------------------------------| +| predictionCol | Integer | `"prediction"` | Label of the max probability. | +| rawPredictionCol | Vector | `"rawPrediction"` | Vector of the probability of each label. | + +### Parameters + +Below are the parameters required by `LogisticRegressionModel`. + +| Key | Default | Type | Required | Description | +| ---------------- | ----------------- | ------ | -------- | --------------------------- | +| featuresCol | `"features"` | String | no | Features column name. | +| predictionCol | `"prediction"` | String | no | Prediction column name. | +| rawPredictionCol | `"rawPrediction"` | String | no | Raw prediction column name. | + +`LogisticRegression` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|-----------------|-----------|---------|----------|---------------------------------------------------------------------------| +| labelCol | `"label"` | String | no | Label column name. | +| weightCol | `null` | String | no | Weight column name. | +| maxIter | `20` | Integer | no | Maximum number of iterations. | +| reg | `0.` | Double | no | Regularization parameter. | +| elasticNet | `0.` | Double | no | ElasticNet parameter. | +| learningRate | `0.1` | Double | no | Learning rate of optimization method. | +| globalBatchSize | `32` | Integer | no | Global batch size of training algorithms. | +| tol | `1e-6` | Double | no | Convergence tolerance for iterative algorithms. | +| multiClass | `"auto"` | String | no | Classification type. Supported values: "auto", "binomial", "multinomial". | + +### Examples +{{< tabs examples >}} + +{{< tab "Java">}} +```java +import org.apache.flink.ml.classification.logisticregression.LogisticRegression; +import org.apache.flink.ml.classification.logisticregression.LogisticRegressionModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a LogisticRegression model and uses it for classification. */ +public class LogisticRegressionExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. 
+ DataStream inputStream = + env.fromElements( + Row.of(Vectors.dense(1, 2, 3, 4), 0., 1.), + Row.of(Vectors.dense(2, 2, 3, 4), 0., 2.), + Row.of(Vectors.dense(3, 2, 3, 4), 0., 3.), + Row.of(Vectors.dense(4, 2, 3, 4), 0., 4.), + Row.of(Vectors.dense(5, 2, 3, 4), 0., 5.), + Row.of(Vectors.dense(11, 2, 3, 4), 1., 1.), + Row.of(Vectors.dense(12, 2, 3, 4), 1., 2.), + Row.of(Vectors.dense(13, 2, 3, 4), 1., 3.), + Row.of(Vectors.dense(14, 2, 3, 4), 1., 4.), + Row.of(Vectors.dense(15, 2, 3, 4), 1., 5.)); + Table inputTable = tEnv.fromDataStream(inputStream).as("features", "label", "weight"); + + // Creates a LogisticRegression object and initializes its parameters. + LogisticRegression lr = new LogisticRegression().setWeightCol("weight"); + + // Trains the LogisticRegression Model. + LogisticRegressionModel lrModel = lr.fit(inputTable); + + // Uses the LogisticRegression Model for predictions. + Table outputTable = lrModel.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector features = (DenseVector) row.getField(lr.getFeaturesCol()); + double expectedResult = (Double) row.getField(lr.getLabelCol()); + double predictionResult = (Double) row.getField(lr.getPredictionCol()); + DenseVector rawPredictionResult = (DenseVector) row.getField(lr.getRawPredictionCol()); + System.out.printf( + "Features: %-25s \tExpected Result: %s \tPrediction Result: %s \tRaw Prediction Result: %s\n", + features, expectedResult, predictionResult, rawPredictionResult); + } + } +} + +``` +{{< /tab>}} + +{{< tab "Python">}} +```python +# Simple program that trains a LogisticRegression model and uses it for +# classification. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.classification.logisticregression import LogisticRegression +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense([1, 2, 3, 4]), 0., 1.), + (Vectors.dense([2, 2, 3, 4]), 0., 2.), + (Vectors.dense([3, 2, 3, 4]), 0., 3.), + (Vectors.dense([4, 2, 3, 4]), 0., 4.), + (Vectors.dense([5, 2, 3, 4]), 0., 5.), + (Vectors.dense([11, 2, 3, 4]), 1., 1.), + (Vectors.dense([12, 2, 3, 4]), 1., 2.), + (Vectors.dense([13, 2, 3, 4]), 1., 3.), + (Vectors.dense([14, 2, 3, 4]), 1., 4.), + (Vectors.dense([15, 2, 3, 4]), 1., 5.), + ], + type_info=Types.ROW_NAMED( + ['features', 'label', 'weight'], + [DenseVectorTypeInfo(), Types.DOUBLE(), Types.DOUBLE()]) + )) + +# create a logistic regression object and initialize its parameters +logistic_regression = LogisticRegression().set_weight_col('weight') + +# train the logistic regression model +model = logistic_regression.fit(input_data) + +# use the logistic regression model for predictions +output = model.transform(input_data)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + features = result[field_names.index(logistic_regression.get_features_col())] + expected_result = result[field_names.index(logistic_regression.get_label_col())] + prediction_result = 
result[field_names.index(logistic_regression.get_prediction_col())]
+    raw_prediction_result = result[field_names.index(logistic_regression.get_raw_prediction_col())]
+    print('Features: ' + str(features) + ' \tExpected Result: ' + str(expected_result)
+          + ' \tPrediction Result: ' + str(prediction_result)
+          + ' \tRaw Prediction Result: ' + str(raw_prediction_result))
+
+```
+{{< /tab>}}
+
+{{< /tabs>}}
+
+## OnlineLogisticRegression
+
+Online Logistic Regression supports training an online regression model on an
+unbounded stream of training data.
+
+The online optimizer of this algorithm is the FTRL-Proximal algorithm proposed
+by H. Brendan McMahan et al. See [H. Brendan McMahan et al., Ad click
+prediction: a view from the trenches.](https://doi.org/10.1145/2487575.2488200)
+
+### Input Columns
+
+| Param name  | Type    | Default      | Description      |
+| :---------- | :------ | :----------- | :--------------- |
+| featuresCol | Vector  | `"features"` | Feature vector   |
+| labelCol    | Integer | `"label"`    | Label to predict |
+| weightCol   | Double  | `"weight"`   | Weight of sample |
+
+### Output Columns
+
+| Param name       | Type    | Default           | Description                                             |
+| :--------------- | :------ | :---------------- | :------------------------------------------------------ |
+| predictionCol    | Integer | `"prediction"`    | Label of the max probability                            |
+| rawPredictionCol | Vector  | `"rawPrediction"` | Vector of the probability of each label                 |
+| modelVersionCol  | Long    | `"modelVersion"`  | The version of the model data used for this prediction  |
+
+### Parameters
+
+Below are the parameters required by `OnlineLogisticRegressionModel`.
+
+| Key              | Default           | Type   | Required | Description                 |
+| ---------------- | ----------------- | ------ | -------- | --------------------------- |
+| featuresCol      | `"features"`      | String | no       | Features column name.       |
+| predictionCol    | `"prediction"`    | String | no       | Prediction column name.     |
+| rawPredictionCol | `"rawPrediction"` | String | no       | Raw prediction column name. |
+| modelVersionCol  | `"modelVersion"`  | String | no       | Model version column name.  |
+
+`OnlineLogisticRegression` needs parameters above and also below.
+
+| Key             | Default          | Type    | Required | Description                                            |
+| --------------- | ---------------- | ------- | -------- | ------------------------------------------------------ |
+| labelCol        | `"label"`        | String  | no       | Label column name.                                     |
+| weightCol       | `null`           | String  | no       | Weight column name.                                    |
+| batchStrategy   | `COUNT_STRATEGY` | String  | no       | Strategy to create mini batch from online train data.  |
+| globalBatchSize | `32`             | Integer | no       | Global batch size of training algorithms.              |
+| reg             | `0.`             | Double  | no       | Regularization parameter.                              |
+| elasticNet      | `0.`             | Double  | no       | ElasticNet parameter.
| + +### Examples + +{{< tabs online_examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.ml.classification.logisticregression.OnlineLogisticRegression; +import org.apache.flink.ml.classification.logisticregression.OnlineLogisticRegressionModel; +import org.apache.flink.ml.examples.util.PeriodicSourceFunction; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.ml.linalg.typeinfo.DenseVectorTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** Simple program that trains an OnlineLogisticRegression model and uses it for classification. */ +public class OnlineLogisticRegressionExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(4); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. Both are infinite streams that periodically + // sends out provided data to trigger model update and prediction. + List trainData1 = + Arrays.asList( + Row.of(Vectors.dense(0.1, 2.), 0.), + Row.of(Vectors.dense(0.2, 2.), 0.), + Row.of(Vectors.dense(0.3, 2.), 0.), + Row.of(Vectors.dense(0.4, 2.), 0.), + Row.of(Vectors.dense(0.5, 2.), 0.), + Row.of(Vectors.dense(11., 12.), 1.), + Row.of(Vectors.dense(12., 11.), 1.), + Row.of(Vectors.dense(13., 12.), 1.), + Row.of(Vectors.dense(14., 12.), 1.), + Row.of(Vectors.dense(15., 12.), 1.)); + + List trainData2 = + Arrays.asList( + Row.of(Vectors.dense(0.2, 3.), 0.), + Row.of(Vectors.dense(0.8, 1.), 0.), + Row.of(Vectors.dense(0.7, 1.), 0.), + Row.of(Vectors.dense(0.6, 2.), 0.), + Row.of(Vectors.dense(0.2, 2.), 0.), + Row.of(Vectors.dense(14., 17.), 1.), + Row.of(Vectors.dense(15., 10.), 1.), + Row.of(Vectors.dense(16., 16.), 1.), + Row.of(Vectors.dense(17., 10.), 1.), + Row.of(Vectors.dense(18., 13.), 1.)); + + List predictData = + Arrays.asList( + Row.of(Vectors.dense(0.8, 2.7), 0.0), + Row.of(Vectors.dense(15.5, 11.2), 1.0)); + + RowTypeInfo typeInfo = + new RowTypeInfo( + new TypeInformation[] {DenseVectorTypeInfo.INSTANCE, Types.DOUBLE}, + new String[] {"features", "label"}); + + SourceFunction trainSource = + new PeriodicSourceFunction(1000, Arrays.asList(trainData1, trainData2)); + DataStream trainStream = env.addSource(trainSource, typeInfo); + Table trainTable = tEnv.fromDataStream(trainStream).as("features"); + + SourceFunction predictSource = + new PeriodicSourceFunction(1000, Collections.singletonList(predictData)); + DataStream predictStream = env.addSource(predictSource, typeInfo); + Table predictTable = tEnv.fromDataStream(predictStream).as("features"); + + // Creates an online LogisticRegression object and initializes its parameters and initial + // model data. 
+ Row initModelData = Row.of(Vectors.dense(0.41233679404769874, -0.18088118293232122), 0L); + Table initModelDataTable = tEnv.fromDataStream(env.fromElements(initModelData)); + OnlineLogisticRegression olr = + new OnlineLogisticRegression() + .setFeaturesCol("features") + .setLabelCol("label") + .setPredictionCol("prediction") + .setReg(0.2) + .setElasticNet(0.5) + .setGlobalBatchSize(10) + .setInitialModelData(initModelDataTable); + + // Trains the online LogisticRegression Model. + OnlineLogisticRegressionModel onlineModel = olr.fit(trainTable); + + // Uses the online LogisticRegression Model for predictions. + Table outputTable = onlineModel.transform(predictTable)[0]; + + // Extracts and displays the results. As training data stream continuously triggers the + // update of the internal model data, raw prediction results of the same predict dataset + // would change over time. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector features = (DenseVector) row.getField(olr.getFeaturesCol()); + Double expectedResult = (Double) row.getField(olr.getLabelCol()); + Double predictionResult = (Double) row.getField(olr.getPredictionCol()); + DenseVector rawPredictionResult = (DenseVector) row.getField(olr.getRawPredictionCol()); + System.out.printf( + "Features: %-25s \tExpected Result: %s \tPrediction Result: %s \tRaw Prediction Result: %s\n", + features, expectedResult, predictionResult, rawPredictionResult); + } + } +} + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/classification/naivebayes.md b/docs/content.zh/docs/operators/classification/naivebayes.md new file mode 100644 index 000000000..3fe9beb8e --- /dev/null +++ b/docs/content.zh/docs/operators/classification/naivebayes.md @@ -0,0 +1,192 @@ +--- +title: "Naive Bayes" +type: docs +aliases: +- /operators/classification/naivebayes.html +--- + + +## Naive Bayes + +Naive Bayes is a multiclass classifier. Based on Bayes’ theorem, it assumes that +there is strong (naive) independence between every pair of features. + +### Input Columns + +| Param name | Type | Default | Description | +| :---------- | :------ | :----------- |:------------------| +| featuresCol | Vector | `"features"` | Feature vector. | +| labelCol | Integer | `"label"` | Label to predict. | + +### Output Columns + +| Param name | Type | Default | Description | +| :------------ | :------ | :------------- |:-----------------| +| predictionCol | Integer | `"prediction"` | Predicted label. | + +### Parameters + +Below are parameters required by `NaiveBayesModel`. + +| Key | Default | Type | Required | Description | +| ------------- | --------------- | ------ | -------- |--------------------------------------------------| +| modelType | `"multinomial"` | String | no | The model type. Supported values: "multinomial". | +| featuresCol | `"features"` | String | no | Features column name. | +| predictionCol | `"prediction"` | String | no | Prediction column name. | + +`NaiveBayes` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +| --------- | --------- | ------ | -------- | ------------------------ | +| labelCol | `"label"` | String | no | Label column name. | +| smoothing | `1.0` | Double | no | The smoothing parameter. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} +```java +import org.apache.flink.ml.classification.naivebayes.NaiveBayes; +import org.apache.flink.ml.classification.naivebayes.NaiveBayesModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a NaiveBayes model and uses it for classification. */ +public class NaiveBayesExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. + DataStream trainStream = + env.fromElements( + Row.of(Vectors.dense(0, 0.), 11), + Row.of(Vectors.dense(1, 0), 10), + Row.of(Vectors.dense(1, 1.), 10)); + Table trainTable = tEnv.fromDataStream(trainStream).as("features", "label"); + + DataStream predictStream = + env.fromElements( + Row.of(Vectors.dense(0, 1.)), + Row.of(Vectors.dense(0, 0.)), + Row.of(Vectors.dense(1, 0)), + Row.of(Vectors.dense(1, 1.))); + Table predictTable = tEnv.fromDataStream(predictStream).as("features"); + + // Creates a NaiveBayes object and initializes its parameters. + NaiveBayes naiveBayes = + new NaiveBayes() + .setSmoothing(1.0) + .setFeaturesCol("features") + .setLabelCol("label") + .setPredictionCol("prediction") + .setModelType("multinomial"); + + // Trains the NaiveBayes Model. + NaiveBayesModel naiveBayesModel = naiveBayes.fit(trainTable); + + // Uses the NaiveBayes Model for predictions. + Table outputTable = naiveBayesModel.transform(predictTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector features = (DenseVector) row.getField(naiveBayes.getFeaturesCol()); + double predictionResult = (Double) row.getField(naiveBayes.getPredictionCol()); + System.out.printf("Features: %s \tPrediction Result: %s\n", features, predictionResult); + } + } +} + +``` +{{< /tab>}} + + +{{< tab "Python">}} +```python + +# Simple program that trains a NaiveBayes model and uses it for classification. 
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.classification.naivebayes import NaiveBayes +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input training and prediction data +train_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense([0, 0.]), 11.), + (Vectors.dense([1, 0]), 10.), + (Vectors.dense([1, 1.]), 10.), + ], + type_info=Types.ROW_NAMED( + ['features', 'label'], + [DenseVectorTypeInfo(), Types.DOUBLE()]))) + +predict_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense([0, 1.]),), + (Vectors.dense([0, 0.]),), + (Vectors.dense([1, 0]),), + (Vectors.dense([1, 1.]),), + ], + type_info=Types.ROW_NAMED( + ['features'], + [DenseVectorTypeInfo()]))) + +# create a naive bayes object and initialize its parameters +naive_bayes = NaiveBayes() \ + .set_smoothing(1.0) \ + .set_features_col('features') \ + .set_label_col('label') \ + .set_prediction_col('prediction') \ + .set_model_type('multinomial') + +# train the naive bayes model +model = naive_bayes.fit(train_table) + +# use the naive bayes model for predictions +output = model.transform(predict_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + features = result[field_names.index(naive_bayes.get_features_col())] + prediction_result = result[field_names.index(naive_bayes.get_prediction_col())] + print('Features: ' + str(features) + ' \tPrediction Result: ' + str(prediction_result)) + +``` +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/clustering/_index.md b/docs/content.zh/docs/operators/clustering/_index.md new file mode 100644 index 000000000..86f8a7e87 --- /dev/null +++ b/docs/content.zh/docs/operators/clustering/_index.md @@ -0,0 +1,25 @@ +--- +title: Clustering +bookCollapseSection: true +weight: 1 +aliases: + - /operators/clustering/ +--- + diff --git a/docs/content.zh/docs/operators/clustering/agglomerativeclustering.md b/docs/content.zh/docs/operators/clustering/agglomerativeclustering.md new file mode 100644 index 000000000..9ded65cca --- /dev/null +++ b/docs/content.zh/docs/operators/clustering/agglomerativeclustering.md @@ -0,0 +1,181 @@ +--- +title: "AgglomerativeClustering" +type: docs +aliases: +- /operators/clustering/agglomerativeclustering.html +--- + + +## AgglomerativeClustering + +AgglomerativeClustering performs a hierarchical clustering +using a bottom-up approach. Each observation starts in its +own cluster and the clusters are merged together one by one. + +The output contains two tables. The first one assigns one +cluster Id for each data point. The second one contains the +information of merging two clusters at each step. The data +format of the merging information is +(clusterId1, clusterId2, distance, sizeOfMergedCluster). + +### Input Columns + +| Param name | Type | Default | Description | +|:------------|:-------|:-------------|:----------------| +| featuresCol | Vector | `"features"` | Feature vector. 
| + +### Output Columns + +| Param name | Type | Default | Description | +|:--------------|:--------|:---------------|:--------------------------| +| predictionCol | Integer | `"prediction"` | Predicted cluster center. | + +### Parameters + +| Key | Default | Type | Required | Description | +|:------------------|:------------------------------|:--------|:---------|:-------------------------------------------------------------------------------| +| numClusters | `2` | Integer | no | The max number of clusters to create. | +| distanceThreshold | `null` | Double | no | Threshold to decide whether two clusters should be merged. | +| linkage | `"ward"` | String | no | Criterion for computing distance between two clusters. | +| computeFullTree | `false` | Boolean | no | Whether computes the full tree after convergence. | +| distanceMeasure | `"euclidean"` | String | no | Distance measure. | +| featuresCol | `"features"` | String | no | Features column name. | +| predictionCol | `"prediction"` | String | no | Prediction column name. | +| windows | `GlobalWindows.getInstance()` | Windows | no | Windowing strategy that determines how to create mini-batches from input data. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} +```java +import org.apache.flink.ml.clustering.agglomerativeclustering.AgglomerativeClustering; +import org.apache.flink.ml.clustering.agglomerativeclustering.AgglomerativeClusteringParams; +import org.apache.flink.ml.common.distance.EuclideanDistanceMeasure; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that creates an AgglomerativeClustering instance and uses it for clustering. */ +public class AgglomerativeClusteringExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Vectors.dense(1, 1), + Vectors.dense(1, 4), + Vectors.dense(1, 0), + Vectors.dense(4, 1.5), + Vectors.dense(4, 4), + Vectors.dense(4, 0)); + Table inputTable = tEnv.fromDataStream(inputStream).as("features"); + + // Creates an AgglomerativeClustering object and initializes its parameters. + AgglomerativeClustering agglomerativeClustering = + new AgglomerativeClustering() + .setLinkage(AgglomerativeClusteringParams.LINKAGE_WARD) + .setDistanceMeasure(EuclideanDistanceMeasure.NAME) + .setPredictionCol("prediction"); + + // Uses the AgglomerativeClustering object for clustering. + Table[] outputs = agglomerativeClustering.transform(inputTable); + + // Extracts and displays the results. 
+ for (CloseableIterator it = outputs[0].execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector features = + (DenseVector) row.getField(agglomerativeClustering.getFeaturesCol()); + int clusterId = (Integer) row.getField(agglomerativeClustering.getPredictionCol()); + System.out.printf("Features: %s \tCluster ID: %s\n", features, clusterId); + } + } +} + +``` +{{< /tab>}} + +{{< tab "Python">}} +```python +# Simple program that creates an agglomerativeclustering instance and uses it for clustering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.clustering.agglomerativeclustering import AgglomerativeClustering +from pyflink.table import StreamTableEnvironment +from matplotlib import pyplot as plt +from scipy.cluster.hierarchy import dendrogram + +# Creates a new StreamExecutionEnvironment. +env = StreamExecutionEnvironment.get_execution_environment() + +# Creates a StreamTableEnvironment. +t_env = StreamTableEnvironment.create(env) + +# Generates input data. +input_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense([1, 1]),), + (Vectors.dense([1, 4]),), + (Vectors.dense([1, 0]),), + (Vectors.dense([4, 1.5]),), + (Vectors.dense([4, 4]),), + (Vectors.dense([4, 0]),), + ], + type_info=Types.ROW_NAMED( + ['features'], + [DenseVectorTypeInfo()]))) + +# Creates an AgglomerativeClustering object and initializes its parameters. +agglomerative_clustering = AgglomerativeClustering() \ + .set_linkage('ward') \ + .set_distance_measure('euclidean') \ + .set_prediction_col('prediction') + +# Uses the AgglomerativeClustering for clustering. +outputs = agglomerative_clustering.transform(input_data) + +# Extracts and display the clustering results. +field_names = outputs[0].get_schema().get_field_names() +for result in t_env.to_data_stream(outputs[0]).execute_and_collect(): + features = result[field_names.index(agglomerative_clustering.features_col)] + cluster_id = result[field_names.index(agglomerative_clustering.prediction_col)] + print('Features: ' + str(features) + '\tCluster ID: ' + str(cluster_id)) + +# Visualizes the merge info. +merge_info = [result for result in + t_env.to_data_stream(outputs[1]).execute_and_collect()] +plt.title("Agglomerative Clustering Dendrogram") +dendrogram(merge_info) +plt.xlabel("Index of data point.") +plt.ylabel("Distances between merged clusters.") +plt.show() +``` +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/clustering/kmeans.md b/docs/content.zh/docs/operators/clustering/kmeans.md new file mode 100644 index 000000000..eeeff603a --- /dev/null +++ b/docs/content.zh/docs/operators/clustering/kmeans.md @@ -0,0 +1,329 @@ +--- +title: "Kmeans" +type: docs +aliases: +- /operators/clustering/kmeans.html +--- + + +## K-means + +K-means is a commonly-used clustering algorithm. It groups given data points +into a predefined number of clusters. + +### Input Columns + +| Param name | Type | Default | Description | +|:------------|:-------|:-------------|:----------------| +| featuresCol | Vector | `"features"` | Feature vector. | + +### Output Columns + +| Param name | Type | Default | Description | +|:--------------|:--------|:---------------|:--------------------------| +| predictionCol | Integer | `"prediction"` | Predicted cluster center. | + +### Parameters + +Below are the parameters required by `KMeansModel`. 
+ +| Key | Default | Type | Required | Description | +|-----------------|----------------|---------|----------|---------------------------------------------------------------------------| +| distanceMeasure | `euclidean` | String | no | Distance measure. Supported values: `'euclidean', 'manhattan', 'cosine'`. | +| featuresCol | `"features"` | String | no | Features column name. | +| predictionCol | `"prediction"` | String | no | Prediction column name. | +| k | `2` | Integer | no | The max number of clusters to create. | + +`KMeans` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|----------|------------|---------|----------|------------------------------------------------------------| +| initMode | `"random"` | String | no | The initialization algorithm. Supported options: 'random'. | +| seed | `null` | Long | no | The random seed. | +| maxIter | `20` | Integer | no | Maximum number of iterations. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} +```java +import org.apache.flink.ml.clustering.kmeans.KMeans; +import org.apache.flink.ml.clustering.kmeans.KMeansModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a KMeans model and uses it for clustering. */ +public class KMeansExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Vectors.dense(0.0, 0.0), + Vectors.dense(0.0, 0.3), + Vectors.dense(0.3, 0.0), + Vectors.dense(9.0, 0.0), + Vectors.dense(9.0, 0.6), + Vectors.dense(9.6, 0.0)); + Table inputTable = tEnv.fromDataStream(inputStream).as("features"); + + // Creates a K-means object and initializes its parameters. + KMeans kmeans = new KMeans().setK(2).setSeed(1L); + + // Trains the K-means Model. + KMeansModel kmeansModel = kmeans.fit(inputTable); + + // Uses the K-means Model for predictions. + Table outputTable = kmeansModel.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector features = (DenseVector) row.getField(kmeans.getFeaturesCol()); + int clusterId = (Integer) row.getField(kmeans.getPredictionCol()); + System.out.printf("Features: %s \tCluster ID: %s\n", features, clusterId); + } + } +} + +``` +{{< /tab>}} + +{{< tab "Python">}} +```python +# Simple program that trains a KMeans model and uses it for clustering. 
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.clustering.kmeans import KMeans +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense([0.0, 0.0]),), + (Vectors.dense([0.0, 0.3]),), + (Vectors.dense([0.3, 3.0]),), + (Vectors.dense([9.0, 0.0]),), + (Vectors.dense([9.0, 0.6]),), + (Vectors.dense([9.6, 0.0]),), + ], + type_info=Types.ROW_NAMED( + ['features'], + [DenseVectorTypeInfo()]))) + +# create a kmeans object and initialize its parameters +kmeans = KMeans().set_k(2).set_seed(1) + +# train the kmeans model +model = kmeans.fit(input_data) + +# use the kmeans model for predictions +output = model.transform(input_data)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + features = result[field_names.index(kmeans.get_features_col())] + cluster_id = result[field_names.index(kmeans.get_prediction_col())] + print('Features: ' + str(features) + ' \tCluster Id: ' + str(cluster_id)) + +``` +{{< /tab>}} + +{{< /tabs>}} + +## Online K-means + +Online K-Means extends the function of K-Means, supporting to train a K-Means +model continuously according to an unbounded stream of train data. + +Online K-Means makes updates with the "mini-batch" K-Means rule, generalized to +incorporate forgetfulness (i.e. decay). After the centroids estimated on the +current batch are acquired, Online K-Means computes the new centroids from the +weighted average between the original and the estimated centroids. The weight of +the estimated centroids is the number of points assigned to them. The weight of +the original centroids is also the number of points, but additionally +multiplying with the decay factor. + +The decay factor scales the contribution of the clusters as estimated thus far. +If the decay factor is 1, all batches are weighted equally. If the decay factor +is 0, new centroids are determined entirely by recent data. Lower values +correspond to more forgetting. + +### Input Columns + +| Param name | Type | Default | Description | +|:------------|:-------|:-------------|:---------------| +| featuresCol | Vector | `"features"` | Feature vector | + +### Output Columns + +| Param name | Type | Default | Description | +|:--------------|:--------|:---------------|:-------------------------| +| predictionCol | Integer | `"prediction"` | Predicted cluster center | + +### Parameters + +Below are the parameters required by `OnlineKMeansModel`. + +| Key | Default | Type | Required | Description | +|-----------------|----------------|---------|----------|---------------------------------------------------------------------------| +| distanceMeasure | `euclidean` | String | no | Distance measure. Supported values: `'euclidean', 'manhattan', 'cosine'`. | +| featuresCol | `"features"` | String | no | Features column name. | +| predictionCol | `"prediction"` | String | no | Prediction column name. | +| k | `2` | Integer | no | The max number of clusters to create. | + +`OnlineKMeans` needs parameters above and also below. 
+ +| Key | Default | Type | Required | Description | +|-----------------|------------------|---------|----------|-------------------------------------------------------| +| batchStrategy | `COUNT_STRATEGY` | String | no | Strategy to create mini batch from online train data. | +| globalBatchSize | `32` | Integer | no | Global batch size of training algorithms. | +| decayFactor | `0.` | Double | no | The forgetfulness of the previous centroids. | +| seed | null | Long | no | The random seed. | + +### Examples + +{{< tabs online_examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.ml.clustering.kmeans.KMeansModelData; +import org.apache.flink.ml.clustering.kmeans.OnlineKMeans; +import org.apache.flink.ml.clustering.kmeans.OnlineKMeansModel; +import org.apache.flink.ml.examples.util.PeriodicSourceFunction; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.ml.linalg.typeinfo.DenseVectorTypeInfo; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +/** Simple program that trains an OnlineKMeans model and uses it for clustering. */ +public class OnlineKMeansExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(4); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. Both are infinite streams that periodically + // sends out provided data to trigger model update and prediction. + List trainData1 = + Arrays.asList( + Row.of(Vectors.dense(0.0, 0.0)), + Row.of(Vectors.dense(0.0, 0.3)), + Row.of(Vectors.dense(0.3, 0.0)), + Row.of(Vectors.dense(9.0, 0.0)), + Row.of(Vectors.dense(9.0, 0.6)), + Row.of(Vectors.dense(9.6, 0.0))); + + List trainData2 = + Arrays.asList( + Row.of(Vectors.dense(10.0, 100.0)), + Row.of(Vectors.dense(10.0, 100.3)), + Row.of(Vectors.dense(10.3, 100.0)), + Row.of(Vectors.dense(-10.0, -100.0)), + Row.of(Vectors.dense(-10.0, -100.6)), + Row.of(Vectors.dense(-10.6, -100.0))); + + List predictData = + Arrays.asList( + Row.of(Vectors.dense(10.0, 10.0)), Row.of(Vectors.dense(-10.0, 10.0))); + + SourceFunction trainSource = + new PeriodicSourceFunction(1000, Arrays.asList(trainData1, trainData2)); + DataStream trainStream = + env.addSource(trainSource, new RowTypeInfo(DenseVectorTypeInfo.INSTANCE)); + Table trainTable = tEnv.fromDataStream(trainStream).as("features"); + + SourceFunction predictSource = + new PeriodicSourceFunction(1000, Collections.singletonList(predictData)); + DataStream predictStream = + env.addSource(predictSource, new RowTypeInfo(DenseVectorTypeInfo.INSTANCE)); + Table predictTable = tEnv.fromDataStream(predictStream).as("features"); + + // Creates an online K-means object and initializes its parameters and initial model data. 
+ OnlineKMeans onlineKMeans = + new OnlineKMeans() + .setFeaturesCol("features") + .setPredictionCol("prediction") + .setGlobalBatchSize(6) + .setInitialModelData( + KMeansModelData.generateRandomModelData(tEnv, 2, 2, 0.0, 0)); + + // Trains the online K-means Model. + OnlineKMeansModel onlineModel = onlineKMeans.fit(trainTable); + + // Uses the online K-means Model for predictions. + Table outputTable = onlineModel.transform(predictTable)[0]; + + // Extracts and displays the results. As training data stream continuously triggers the + // update of the internal k-means model data, clustering results of the same predict dataset + // would change over time. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row1 = it.next(); + DenseVector features1 = (DenseVector) row1.getField(onlineKMeans.getFeaturesCol()); + Integer clusterId1 = (Integer) row1.getField(onlineKMeans.getPredictionCol()); + Row row2 = it.next(); + DenseVector features2 = (DenseVector) row2.getField(onlineKMeans.getFeaturesCol()); + Integer clusterId2 = (Integer) row2.getField(onlineKMeans.getPredictionCol()); + if (Objects.equals(clusterId1, clusterId2)) { + System.out.printf("%s and %s are now in the same cluster.\n", features1, features2); + } else { + System.out.printf( + "%s and %s are now in different clusters.\n", features1, features2); + } + } + } +} + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/evaluation/_index.md b/docs/content.zh/docs/operators/evaluation/_index.md new file mode 100644 index 000000000..355e5577e --- /dev/null +++ b/docs/content.zh/docs/operators/evaluation/_index.md @@ -0,0 +1,25 @@ +--- +title: Evaluation +bookCollapseSection: true +weight: 1 +aliases: + - /operators/evaluation/ +--- + diff --git a/docs/content.zh/docs/operators/evaluation/binaryclassificationevaluator.md b/docs/content.zh/docs/operators/evaluation/binaryclassificationevaluator.md new file mode 100644 index 000000000..6e2d1a9ee --- /dev/null +++ b/docs/content.zh/docs/operators/evaluation/binaryclassificationevaluator.md @@ -0,0 +1,192 @@ +--- +title: "Binary Classification Evaluator" +weight: 1 +type: docs +aliases: +- /operators/evaluation/binaryclassificationevaluator.html +--- + + + +## Binary Classification Evaluator + +Binary Classification Evaluator calculates the evaluation metrics for binary +classification. The input data has `rawPrediction`, `label`, and an optional +weight column. The `rawPrediction` can be of type double (binary 0/1 prediction, +or probability of label 1) or of type vector (length-2 vector of raw +predictions, scores, or label probabilities). The output may contain different +metrics defined by the parameter `MetricsNames`. +### Input Columns + +| Param name | Type | Default | Description | +| :--------------- | :------------ | :-------------- |:---------------------------| +| labelCol | Number | `"label"` | The label of this entry. | +| rawPredictionCol | Vector/Number | `rawPrediction` | The raw prediction result. | +| weightCol | Number | `null` | The weight of this entry. | + +### Output Columns + +| Column name | Type | Description | +| ----------------- | ------ |--------------------------------------------------------------------------------------------------| +| "areaUnderROC" | Double | The area under the receiver operating characteristic (ROC) curve. | +| "areaUnderPR" | Double | The area under the precision-recall curve. 
| +| "areaUnderLorenz" | Double | Kolmogorov-Smirnov, measures the ability of the model to separate positive and negative samples. | +| "ks" | Double | The area under the lorenz curve. | + +### Parameters + +| Key | Default | Type | Required | Description | +|------------------|-----------------------------------|----------|----------|--------------------------------------------------------------------------------------------------------| +| labelCol | `"label"` | String | no | Label column name. | +| weightCol | `null` | String | no | Weight column name. | +| rawPredictionCol | `"rawPrediction"` | String | no | Raw prediction column name. | +| metricsNames | `["areaUnderROC", "areaUnderPR"]` | String[] | no | Names of the output metrics. Supported values: 'areaUnderROC', 'areaUnderPR', 'areaUnderLorenz', 'ks'. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.evaluation.binaryclassification.BinaryClassificationEvaluator; +import org.apache.flink.ml.evaluation.binaryclassification.BinaryClassificationEvaluatorParams; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; + +/** + * Simple program that creates a BinaryClassificationEvaluator instance and uses it for evaluation. + */ +public class BinaryClassificationEvaluatorExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(1.0, Vectors.dense(0.1, 0.9)), + Row.of(1.0, Vectors.dense(0.2, 0.8)), + Row.of(1.0, Vectors.dense(0.3, 0.7)), + Row.of(0.0, Vectors.dense(0.25, 0.75)), + Row.of(0.0, Vectors.dense(0.4, 0.6)), + Row.of(1.0, Vectors.dense(0.35, 0.65)), + Row.of(1.0, Vectors.dense(0.45, 0.55)), + Row.of(0.0, Vectors.dense(0.6, 0.4)), + Row.of(0.0, Vectors.dense(0.7, 0.3)), + Row.of(1.0, Vectors.dense(0.65, 0.35)), + Row.of(0.0, Vectors.dense(0.8, 0.2)), + Row.of(1.0, Vectors.dense(0.9, 0.1))); + Table inputTable = tEnv.fromDataStream(inputStream).as("label", "rawPrediction"); + + // Creates a BinaryClassificationEvaluator object and initializes its parameters. + BinaryClassificationEvaluator evaluator = + new BinaryClassificationEvaluator() + .setMetricsNames( + BinaryClassificationEvaluatorParams.AREA_UNDER_PR, + BinaryClassificationEvaluatorParams.KS, + BinaryClassificationEvaluatorParams.AREA_UNDER_ROC); + + // Uses the BinaryClassificationEvaluator object for evaluations. + Table outputTable = evaluator.transform(inputTable)[0]; + + // Extracts and displays the results. 
+ Row evaluationResult = outputTable.execute().collect().next(); + System.out.printf( + "Area under the precision-recall curve: %s\n", + evaluationResult.getField(BinaryClassificationEvaluatorParams.AREA_UNDER_PR)); + System.out.printf( + "Area under the receiver operating characteristic curve: %s\n", + evaluationResult.getField(BinaryClassificationEvaluatorParams.AREA_UNDER_ROC)); + System.out.printf( + "Kolmogorov-Smirnov value: %s\n", + evaluationResult.getField(BinaryClassificationEvaluatorParams.KS)); + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a BinaryClassificationEvaluator instance and uses +# it for evaluation. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.evaluation.binaryclassification import BinaryClassificationEvaluator +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_table = t_env.from_data_stream( + env.from_collection([ + (1.0, Vectors.dense(0.1, 0.9)), + (1.0, Vectors.dense(0.2, 0.8)), + (1.0, Vectors.dense(0.3, 0.7)), + (0.0, Vectors.dense(0.25, 0.75)), + (0.0, Vectors.dense(0.4, 0.6)), + (1.0, Vectors.dense(0.35, 0.65)), + (1.0, Vectors.dense(0.45, 0.55)), + (0.0, Vectors.dense(0.6, 0.4)), + (0.0, Vectors.dense(0.7, 0.3)), + (1.0, Vectors.dense(0.65, 0.35)), + (0.0, Vectors.dense(0.8, 0.2)), + (1.0, Vectors.dense(0.9, 0.1)) + ], + type_info=Types.ROW_NAMED( + ['label', 'rawPrediction'], + [Types.DOUBLE(), DenseVectorTypeInfo()])) +) + +# create a binary classification evaluator object and initialize its parameters +evaluator = BinaryClassificationEvaluator() \ + .set_metrics_names('areaUnderPR', 'ks', 'areaUnderROC') + +# use the binary classification evaluator model for evaluations +output = evaluator.transform(input_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +result = t_env.to_data_stream(output).execute_and_collect().next() +print('Area under the precision-recall curve: ' + + str(result[field_names.index('areaUnderPR')])) +print('Area under the receiver operating characteristic curve: ' + + str(result[field_names.index('areaUnderROC')])) +print('Kolmogorov-Smirnov value: ' + + str(result[field_names.index('ks')])) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/_index.md b/docs/content.zh/docs/operators/feature/_index.md new file mode 100644 index 000000000..a87ec2326 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/_index.md @@ -0,0 +1,25 @@ +--- +title: Feature Engineering +bookCollapseSection: true +weight: 1 +aliases: + - /operators/feature/ +--- + diff --git a/docs/content.zh/docs/operators/feature/binarizer.md b/docs/content.zh/docs/operators/feature/binarizer.md new file mode 100644 index 000000000..68e700abd --- /dev/null +++ b/docs/content.zh/docs/operators/feature/binarizer.md @@ -0,0 +1,183 @@ +--- +title: "Binarizer" +weight: 1 +type: docs +aliases: +- /operators/feature/binarizer.html +--- + + + +## Binarizer + +Binarizer binarizes the columns of continuous features by the given thresholds. +The continuous features may be DenseVector, SparseVector, or Numerical Value. 
+ +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:--------------|:--------|:--------------------------------| +| inputCols | Number/Vector | `null` | Number/Vectors to be binarized. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:--------------|:--------|:--------------------------| +| outputCols | Number/Vector | `null` | Binarized Number/Vectors. | + +### Parameters + +| Key | Default | Type | Required | Description | +|-------------|-----------|----------|----------|------------------------------------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCols | `null` | String[] | yes | Output column name. | +| thresholds | `null` | Double[] | yes | The thresholds used to binarize continuous features. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.binarizer.Binarizer; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates a Binarizer instance and uses it for feature engineering. */ +public class BinarizerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of( + 1, + Vectors.dense(1, 2), + Vectors.sparse( + 17, new int[] {0, 3, 9}, new double[] {1.0, 2.0, 7.0})), + Row.of( + 2, + Vectors.dense(2, 1), + Vectors.sparse( + 17, new int[] {0, 2, 14}, new double[] {5.0, 4.0, 1.0})), + Row.of( + 3, + Vectors.dense(5, 18), + Vectors.sparse( + 17, new int[] {0, 11, 12}, new double[] {2.0, 4.0, 4.0}))); + + Table inputTable = tEnv.fromDataStream(inputStream).as("f0", "f1", "f2"); + + // Creates a Binarizer object and initializes its parameters. + Binarizer binarizer = + new Binarizer() + .setInputCols("f0", "f1", "f2") + .setOutputCols("of0", "of1", "of2") + .setThresholds(0.0, 0.0, 0.0); + + // Transforms input data. + Table outputTable = binarizer.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + Object[] inputValues = new Object[binarizer.getInputCols().length]; + Object[] outputValues = new Object[binarizer.getInputCols().length]; + for (int i = 0; i < inputValues.length; i++) { + inputValues[i] = row.getField(binarizer.getInputCols()[i]); + outputValues[i] = row.getField(binarizer.getOutputCols()[i]); + } + + System.out.printf( + "Input Values: %s\tOutput Values: %s\n", + Arrays.toString(inputValues), Arrays.toString(outputValues)); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a Binarizer instance and uses it for feature +# engineering. 
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.binarizer import Binarizer +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data_table = t_env.from_data_stream( + env.from_collection([ + (1, + Vectors.dense(3, 4)), + (2, + Vectors.dense(6, 2)) + ], + type_info=Types.ROW_NAMED( + ['f0', 'f1'], + [Types.INT(), DenseVectorTypeInfo()]))) + +# create an binarizer object and initialize its parameters +binarizer = Binarizer() \ + .set_input_cols('f0', 'f1') \ + .set_output_cols('of0', 'of1') \ + .set_thresholds(1.5, 3.5) + +# use the binarizer for feature engineering +output = binarizer.transform(input_data_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +input_values = [None for _ in binarizer.get_input_cols()] +output_values = [None for _ in binarizer.get_output_cols()] +for result in t_env.to_data_stream(output).execute_and_collect(): + for i in range(len(binarizer.get_input_cols())): + input_values[i] = result[field_names.index(binarizer.get_input_cols()[i])] + output_values[i] = result[field_names.index(binarizer.get_output_cols()[i])] + print('Input Values: ' + str(input_values) + '\tOutput Values: ' + str(output_values)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/bucketizer.md b/docs/content.zh/docs/operators/feature/bucketizer.md new file mode 100644 index 000000000..c19abfa80 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/bucketizer.md @@ -0,0 +1,179 @@ +--- +title: "Bucketizer" +weight: 1 +type: docs +aliases: +- /operators/feature/bucketizer.html +--- + + + +## Bucketizer + +Bucketizer is an algorithm that maps multiple columns of continuous features to +multiple columns of discrete features, i.e., buckets indices. The indices are in +[0, numSplitsInThisColumn - 1]. +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:--------|:--------------------------------------| +| inputCols | Number | `null` | Continuous features to be bucketized. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:--------|:----------------------| +| outputCols | Double | `null` | Discretized features. | + +### Parameters + +| Key | Default | Type | Required | Description | +|---------------|-----------|-------------|----------|--------------------------------------------------------------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCols | `null` | String[] | yes | Output column names. | +| handleInvalid | `"error"` | String | no | Strategy to handle invalid entries. Supported values: 'error', 'skip', 'keep'. | +| splitsArray | `null` | Double\[][] | yes | Array of split points for mapping continuous features into buckets. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.common.param.HasHandleInvalid; +import org.apache.flink.ml.feature.bucketizer.Bucketizer; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates a Bucketizer instance and uses it for feature engineering. */ +public class BucketizerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = env.fromElements(Row.of(-0.5, 0.0, 1.0, 0.0)); + Table inputTable = tEnv.fromDataStream(inputStream).as("f1", "f2", "f3", "f4"); + + // Creates a Bucketizer object and initializes its parameters. + Double[][] splitsArray = + new Double[][] { + new Double[] {-0.5, 0.0, 0.5}, + new Double[] {-1.0, 0.0, 2.0}, + new Double[] {Double.NEGATIVE_INFINITY, 10.0, Double.POSITIVE_INFINITY}, + new Double[] {Double.NEGATIVE_INFINITY, 1.5, Double.POSITIVE_INFINITY} + }; + Bucketizer bucketizer = + new Bucketizer() + .setInputCols("f1", "f2", "f3", "f4") + .setOutputCols("o1", "o2", "o3", "o4") + .setSplitsArray(splitsArray) + .setHandleInvalid(HasHandleInvalid.SKIP_INVALID); + + // Uses the Bucketizer object for feature transformations. + Table outputTable = bucketizer.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + double[] inputValues = new double[bucketizer.getInputCols().length]; + double[] outputValues = new double[bucketizer.getInputCols().length]; + for (int i = 0; i < inputValues.length; i++) { + inputValues[i] = (double) row.getField(bucketizer.getInputCols()[i]); + outputValues[i] = (double) row.getField(bucketizer.getOutputCols()[i]); + } + + System.out.printf( + "Input Values: %s\tOutput Values: %s\n", + Arrays.toString(inputValues), Arrays.toString(outputValues)); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a Bucketizer instance and uses it for feature +# engineering. 
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.bucketizer import Bucketizer +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data = t_env.from_data_stream( + env.from_collection([ + (-0.5, 0.0, 1.0, 0.0), + ], + type_info=Types.ROW_NAMED( + ['f1', 'f2', 'f3', 'f4'], + [Types.DOUBLE(), Types.DOUBLE(), Types.DOUBLE(), Types.DOUBLE()]) + )) + +# create a bucketizer object and initialize its parameters +splits_array = [ + [-0.5, 0.0, 0.5], + [-1.0, 0.0, 2.0], + [float('-inf'), 10.0, float('inf')], + [float('-inf'), 1.5, float('inf')], +] + +bucketizer = Bucketizer() \ + .set_input_cols('f1', 'f2', 'f3', 'f4') \ + .set_output_cols('o1', 'o2', 'o3', 'o4') \ + .set_splits_array(splits_array) + +# use the bucketizer model for feature engineering +output = bucketizer.transform(input_data)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +input_values = [None for _ in bucketizer.get_input_cols()] +output_values = [None for _ in bucketizer.get_input_cols()] +for result in t_env.to_data_stream(output).execute_and_collect(): + for i in range(len(bucketizer.get_input_cols())): + input_values[i] = result[field_names.index(bucketizer.get_input_cols()[i])] + output_values[i] = result[field_names.index(bucketizer.get_output_cols()[i])] + print('Input Values: ' + str(input_values) + '\tOutput Values: ' + str(output_values)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/countvectorizer.md b/docs/content.zh/docs/operators/feature/countvectorizer.md new file mode 100644 index 000000000..b6658c06c --- /dev/null +++ b/docs/content.zh/docs/operators/feature/countvectorizer.md @@ -0,0 +1,182 @@ +--- +title: "CountVectorizer" +weight: 1 +type: docs +aliases: +- /operators/feature/countvectorizer.html +--- + + + +## CountVectorizer + +CountVectorizer is an algorithm that converts a collection of text +documents to vectors of token counts. When an a-priori dictionary is not +available, CountVectorizer can be used as an estimator to extract the +vocabulary, and generates a CountVectorizerModel. The model produces sparse +representations for the documents over the vocabulary, which can then be +passed to other algorithms like LDA. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:---------|:----------|:--------------------| +| inputCol | String[] | `"input"` | Input string array. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------------|:-----------|:------------------------| +| outputCol | SparseVector | `"output"` | Vector of token counts. | + +### Parameters + +Below are the parameters required by `CountVectorizerModel`. + +| Key | Default | Type | Required | Description | +|------------|------------|---------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| inputCol | `"input"` | String | no | Input column name. 
| +| outputCol | `"output"` | String | no | Output column name. | +| minTF | `1.0` | Double | no | Filter to ignore rare words in a document. For each document, terms with frequency/count less than the given threshold are ignored. If this is an integer >= 1, then this specifies a count (of times the term must appear in the document); if this is a double in [0,1), then this specifies a fraction (out of the document's token count). | +| binary | `false` | Boolean | no | Binary toggle to control the output vector values. If True, all nonzero counts (after minTF filter applied) are set to 1.0. | + +`CountVectorizer` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|:---------------|:-----------|:---------|:---------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| vocabularySize | `2^18` | Integer | no | Max size of the vocabulary. CountVectorizer will build a vocabulary that only considers the top vocabulary size terms ordered by term frequency across the corpus. | +| minDF | `1.0` | Double | no | Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. | +| maxDF | `2^63 - 1` | Double | no | Specifies the maximum number of different documents a term could appear in to be included in the vocabulary. A term that appears more than the threshold will be ignored. If this is an integer >= 1, this specifies the maximum number of documents the term could appear in; if this is a double in [0,1), then this specifies the maximum fraction of documents the term could appear in. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.countvectorizer.CountVectorizer; +import org.apache.flink.ml.feature.countvectorizer.CountVectorizerModel; +import org.apache.flink.ml.linalg.SparseVector; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** + * Simple program that trains a {@link CountVectorizer} model and uses it for feature engineering. + */ +public class CountVectorizerExample { + + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. 
+ DataStream dataStream = + env.fromElements( + Row.of((Object) new String[] {"a", "c", "b", "c"}), + Row.of((Object) new String[] {"c", "d", "e"}), + Row.of((Object) new String[] {"a", "b", "c"}), + Row.of((Object) new String[] {"e", "f"}), + Row.of((Object) new String[] {"a", "c", "a"})); + Table inputTable = tEnv.fromDataStream(dataStream).as("input"); + + // Creates an CountVectorizer object and initialize its parameters + CountVectorizer countVectorizer = new CountVectorizer(); + + // Trains the CountVectorizer model + CountVectorizerModel model = countVectorizer.fit(inputTable); + + // Uses the CountVectorizer model for predictions. + Table outputTable = model.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + String[] inputValue = (String[]) row.getField(countVectorizer.getInputCol()); + SparseVector outputValue = (SparseVector) row.getField(countVectorizer.getOutputCol()); + System.out.printf( + "Input Value: %-15s \tOutput Value: %s\n", + Arrays.toString(inputValue), outputValue.toString()); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python + +# Simple program that creates an CountVectorizer instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.countvectorizer import CountVectorizer +from pyflink.table import StreamTableEnvironment + +# Creates a new StreamExecutionEnvironment. +env = StreamExecutionEnvironment.get_execution_environment() + +# Creates a StreamTableEnvironment. +t_env = StreamTableEnvironment.create(env) + +# Generates input training and prediction data. +input_table = t_env.from_data_stream( + env.from_collection([ + (1, ['a', 'c', 'b', 'c'],), + (2, ['c', 'd', 'e'],), + (3, ['a', 'b', 'c'],), + (4, ['e', 'f'],), + (5, ['a', 'c', 'a'],), + ], + type_info=Types.ROW_NAMED( + ['id', 'input', ], + [Types.INT(), Types.OBJECT_ARRAY(Types.STRING())]) + )) + +# Creates an CountVectorizer object and initializes its parameters. +count_vectorizer = CountVectorizer() + +# Trains the CountVectorizer Model. +model = count_vectorizer.fit(input_table) + +# Uses the CountVectorizer Model for predictions. +output = model.transform(input_table)[0] + +# Extracts and displays the results. +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_index = field_names.index(count_vectorizer.get_input_col()) + output_index = field_names.index(count_vectorizer.get_output_col()) + print('Input Value: %-20s Output Value: %10s' % + (str(result[input_index]), str(result[output_index]))) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/dct.md b/docs/content.zh/docs/operators/feature/dct.md new file mode 100644 index 000000000..356260be5 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/dct.md @@ -0,0 +1,151 @@ +--- +title: "DCT" +weight: 1 +type: docs +aliases: +- /operators/feature/dct.html +--- + + + +## DCT + +DCT is a Transformer that takes the 1D discrete cosine transform of a real +vector. No zero padding is performed on the input vector. It returns a real +vector of the same length representing the DCT. The return vector is scaled such +that the transform matrix is unitary (aka scaled DCT-II). 
+ +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:---------------------------------------| +| inputCol | Vector | `"input"` | Input vector to be cosine transformed. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:----------------------------------| +| outputCol | Vector | `"output"` | Cosine transformed output vector. | + +### Parameters + +| Key | Default | Type | Required | Description | +|-----------|------------|---------|----------|-------------------------------------------------------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| inverse | `false` | Boolean | no | Whether to perform the inverse DCT (true) or forward DCT (false). | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.dct.DCT; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; +import java.util.List; + +/** Simple program that creates a DCT instance and uses it for feature engineering. */ +public class DCTExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + List inputData = + Arrays.asList( + Vectors.dense(1.0, 1.0, 1.0, 1.0), Vectors.dense(1.0, 0.0, -1.0, 0.0)); + Table inputTable = tEnv.fromDataStream(env.fromCollection(inputData)).as("input"); + + // Creates a DCT object and initializes its parameters. + DCT dct = new DCT(); + + // Uses the DCT object for feature transformations. + Table outputTable = dct.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + Vector inputValue = row.getFieldAs(dct.getInputCol()); + Vector outputValue = row.getFieldAs(dct.getOutputCol()); + + System.out.printf("Input Value: %s\tOutput Value: %s\n", inputValue, outputValue); + } + } +} +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a DCT instance and uses it for feature +# engineering. 
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.dct import DCT +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(1.0, 1.0, 1.0, 1.0),), + (Vectors.dense(1.0, 0.0, -1.0, 0.0),), + ], + type_info=Types.ROW_NAMED( + ['input'], + [DenseVectorTypeInfo()]))) + +# create a DCT object and initialize its parameters +dct = DCT() + +# use the dct for feature engineering +output = dct.transform(input_data)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(dct.get_input_col())] + output_value = result[field_names.index(dct.get_output_col())] + print('Input Value: ' + str(input_value) + '\tOutput Value: ' + str(output_value)) +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/elementwiseproduct.md b/docs/content.zh/docs/operators/feature/elementwiseproduct.md new file mode 100644 index 000000000..0021c15b4 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/elementwiseproduct.md @@ -0,0 +1,157 @@ +--- +title: "ElementwiseProduct" +weight: 1 +type: docs +aliases: +- /operators/feature/elementwiseproduct.html +--- + + + +## ElementwiseProduct + +ElementwiseProduct multiplies each input vector with a given scaling vector using +Hadamard product. If the size of the input vector does not equal the size of the +scaling vector, the transformer will throw an IllegalArgumentException. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-----------------------| +| inputCol | Vector | `"input"` | Features to be scaled. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:-----------------| +| outputCol | Vector | `"output"` | Scaled features. | + +### Parameters + +| Key | Default | Type | Required | Description | +|------------|------------|--------|----------|---------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| scalingVec | `null` | String | yes | The scaling vector. | +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.elementwiseproduct.ElementwiseProduct; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** + * Simple program that creates an ElementwiseProduct instance and uses it for feature engineering. 
+ */ +public class ElementwiseProductExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(0, Vectors.dense(1.1, 3.2)), Row.of(1, Vectors.dense(2.1, 3.1))); + + Table inputTable = tEnv.fromDataStream(inputStream).as("id", "vec"); + + // Creates an ElementwiseProduct object and initializes its parameters. + ElementwiseProduct elementwiseProduct = + new ElementwiseProduct() + .setInputCol("vec") + .setOutputCol("outputVec") + .setScalingVec(Vectors.dense(1.1, 1.1)); + + // Transforms input data. + Table outputTable = elementwiseProduct.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + Vector inputValue = (Vector) row.getField(elementwiseProduct.getInputCol()); + Vector outputValue = (Vector) row.getField(elementwiseProduct.getOutputCol()); + System.out.printf("Input Value: %s \tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates an ElementwiseProduct instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.elementwiseproduct import ElementwiseProduct +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data_table = t_env.from_data_stream( + env.from_collection([ + (1, Vectors.dense(2.1, 3.1)), + (2, Vectors.dense(1.1, 3.3)) + ], + type_info=Types.ROW_NAMED( + ['id', 'vec'], + [Types.INT(), DenseVectorTypeInfo()]))) + +# create an elementwise product object and initialize its parameters +elementwise_product = ElementwiseProduct() \ + .set_input_col('vec') \ + .set_output_col('output_vec') \ + .set_scaling_vec(Vectors.dense(1.1, 1.1)) + +# use the elementwise product object for feature engineering +output = elementwise_product.transform(input_data_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(elementwise_product.get_input_col())] + output_value = result[field_names.index(elementwise_product.get_output_col())] + print('Input Value: ' + str(input_value) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/featurehasher.md b/docs/content.zh/docs/operators/feature/featurehasher.md new file mode 100644 index 000000000..e804d9ae6 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/featurehasher.md @@ -0,0 +1,177 @@ +--- +title: "FeatureHasher" +weight: 1 +type: docs +aliases: +- /operators/feature/featurehasher.html +--- + + + +## FeatureHasher + +FeatureHasher transforms a set of categorical or numerical features into a sparse vector of +a specified dimension. The rules of hashing categorical columns and numerical columns are as +follows: + +
+
+* For numerical columns, the index of this feature in the output vector is the
+  hash value of the column name and its corresponding value is the same as the
+  input.
+* For categorical columns, the index of this feature in the output vector is
+  the hash value of the string "column_name=value" and the corresponding value
+  is 1.0.
+
If multiple features are projected into the same column, the output values are accumulated. +For the hashing trick, see https://en.wikipedia.org/wiki/Feature_hashing for details. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:----------------------|:--------|:----------------------| +| inputCols | Number/String/Boolean | `null` | Columns to be hashed. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:---------------| +| outputCol | Vector | `"output"` | Output vector. | + +### Parameters + +| Key | Default | Type | Required | Description | +|-----------------|------------|-----------|----------|---------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCol | `"output"` | String | no | Output column name. | +| categoricalCols | `[]` | String[] | no | Categorical column names. | +| numFeatures | `262144` | Integer | no | The number of features. | +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.featurehasher.FeatureHasher; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates a FeatureHasher instance and uses it for feature engineering. */ +public class FeatureHasherExample { + public static void main(String[] args) { + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream dataStream = + env.fromCollection( + Arrays.asList(Row.of(0, "a", 1.0, true), Row.of(1, "c", 1.0, false))); + Table inputDataTable = tEnv.fromDataStream(dataStream).as("id", "f0", "f1", "f2"); + + // Creates a FeatureHasher object and initializes its parameters. + FeatureHasher featureHash = + new FeatureHasher() + .setInputCols("f0", "f1", "f2") + .setCategoricalCols("f0", "f2") + .setOutputCol("vec") + .setNumFeatures(1000); + + // Uses the FeatureHasher object for feature transformations. + Table outputTable = featureHash.transform(inputDataTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + Object[] inputValues = new Object[featureHash.getInputCols().length]; + for (int i = 0; i < inputValues.length; i++) { + inputValues[i] = row.getField(featureHash.getInputCols()[i]); + } + Vector outputValue = (Vector) row.getField(featureHash.getOutputCol()); + + System.out.printf( + "Input Values: %s \tOutput Value: %s\n", + Arrays.toString(inputValues), outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a FeatureHasher instance and uses it for feature +# engineering. 
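+#
+# 'f0' and 'f2' are declared categorical below, so e.g. the value 'a' in column
+# 'f0' is hashed as the string "f0=a" and stored as 1.0, while 'f1' is numerical,
+# so the hash of the column name "f1" picks the index and the raw value is stored.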
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.featurehasher import FeatureHasher +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data_table = t_env.from_data_stream( + env.from_collection([ + (0, 'a', 1.0, True), + (1, 'c', 1.0, False), + ], + type_info=Types.ROW_NAMED( + ['id', 'f0', 'f1', 'f2'], + [Types.INT(), Types.STRING(), Types.DOUBLE(), Types.BOOLEAN()]))) + +# create a feature hasher object and initialize its parameters +feature_hasher = FeatureHasher() \ + .set_input_cols('f0', 'f1', 'f2') \ + .set_categorical_cols('f0', 'f2') \ + .set_output_col('vec') \ + .set_num_features(1000) + +# use the feature hasher for feature engineering +output = feature_hasher.transform(input_data_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +input_values = [None for _ in feature_hasher.get_input_cols()] +for result in t_env.to_data_stream(output).execute_and_collect(): + for i in range(len(feature_hasher.get_input_cols())): + input_values[i] = result[field_names.index(feature_hasher.get_input_cols()[i])] + output_value = result[field_names.index(feature_hasher.get_output_col())] + print('Input Values: ' + str(input_values) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/hashingtf.md b/docs/content.zh/docs/operators/feature/hashingtf.md new file mode 100644 index 000000000..d340d9096 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/hashingtf.md @@ -0,0 +1,165 @@ +--- +title: "HashingTF" +weight: 1 +type: docs +aliases: +- /operators/feature/hashingtf.html +--- + + + +## HashingTF + +HashingTF maps a sequence of terms(strings, numbers, booleans) +to a sparse vector with a specified dimension using the hashing +trick. If multiple features are projected into the same column, +the output values are accumulated by default. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:----------------------------------------------|:----------|:-------------------------| +| inputCol | List/Array of primitive data types or strings | `"input"` | Input sequence of terms. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------------|:-----------|:----------------------| +| outputCol | SparseVector | `"output"` | Output sparse vector. | + +### Parameters + +| Key | Default | Type | Required | Description | +|:------------|:-----------|:--------|:---------|:--------------------------------------------------------------------| +| binary | `false` | Boolean | no | Whether each dimension of the output vector is binary or not. | +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| numFeatures | `262144` | Integer | no | The number of features. It will be the length of the output vector. 
| + + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java + +import org.apache.flink.ml.feature.hashingtf.HashingTF; +import org.apache.flink.ml.linalg.SparseVector; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; +import java.util.List; + +/** Simple program that creates a HashingTF instance and uses it for feature engineering. */ +public class HashingTFExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of( + Arrays.asList( + "HashingTFTest", "Hashing", "Term", "Frequency", "Test")), + Row.of( + Arrays.asList( + "HashingTFTest", "Hashing", "Hashing", "Test", "Test"))); + + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates a HashingTF object and initializes its parameters. + HashingTF hashingTF = + new HashingTF().setInputCol("input").setOutputCol("output").setNumFeatures(128); + + // Uses the HashingTF object for feature transformations. + Table outputTable = hashingTF.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + List inputValue = (List) row.getField(hashingTF.getInputCol()); + SparseVector outputValue = (SparseVector) row.getField(hashingTF.getOutputCol()); + + System.out.printf( + "Input Value: %s \tOutput Value: %s\n", + Arrays.toString(inputValue.stream().toArray()), outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a HashingTF instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.hashingtf import HashingTF +from pyflink.table import StreamTableEnvironment + +env = StreamExecutionEnvironment.get_execution_environment() + +t_env = StreamTableEnvironment.create(env) + +# Generates input data. +input_data_table = t_env.from_data_stream( + env.from_collection([ + (['HashingTFTest', 'Hashing', 'Term', 'Frequency', 'Test'],), + (['HashingTFTest', 'Hashing', 'Hashing', 'Test', 'Test'],), + ], + type_info=Types.ROW_NAMED( + ["input", ], + [Types.OBJECT_ARRAY(Types.STRING())]))) + +# Creates a HashingTF object and initializes its parameters. +hashing_tf = HashingTF() \ + .set_input_col('input') \ + .set_num_features(128) \ + .set_output_col('output') + +# Uses the HashingTF object for feature transformations. +output = hashing_tf.transform(input_data_table)[0] + +# Extracts and displays the results. 
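+# The printed sparse vectors contain accumulated term counts, e.g. 'Test'
+# appearing twice in the second row contributes 2.0 to its hashed dimension;
+# the 'binary' parameter would cap every dimension at 1.0 instead.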
+field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(hashing_tf.get_input_col())] + output_value = result[field_names.index(hashing_tf.get_output_col())] + print('Input Value: ' + ' '.join(input_value) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/idf.md b/docs/content.zh/docs/operators/feature/idf.md new file mode 100644 index 000000000..ca26c286a --- /dev/null +++ b/docs/content.zh/docs/operators/feature/idf.md @@ -0,0 +1,172 @@ +--- +title: "IDF" +weight: 1 +type: docs +aliases: +- /operators/feature/IDF.html +--- + + + +## IDF + +IDF computes the inverse document frequency (IDF) for the +input documents. IDF is computed following +`idf = log((m + 1) / (d(t) + 1))`, where `m` is the total +number of documents and `d(t)` is the number of documents +that contains `t`. + +IDFModel further uses the computed inverse document frequency +to compute [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-----------------| +| inputCol | Vector | `"input"` | Input documents. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:----------------------------| +| outputCol | Vector | `"output"` | Tf-idf values of the input. | + +### Parameters + +Below are the parameters required by `IDFModel`. + +| Key | Default | Type | Required | Description | +|:----------|:-----------|:-------|:---------|:--------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | + + +`IDF` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|:-----------|:-----------|:--------|:---------|:---------------------------------------------------------------------| +| minDocFreq | `0` | Integer | no | Minimum number of documents that a term should appear for filtering. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.idf.IDF; +import org.apache.flink.ml.feature.idf.IDFModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains an IDF model and uses it for feature engineering. */ +public class IDFExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(Vectors.dense(0, 1, 0, 2)), + Row.of(Vectors.dense(0, 1, 2, 3)), + Row.of(Vectors.dense(0, 1, 0, 0))); + + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates an IDF object and initializes its parameters. + IDF idf = new IDF().setMinDocFreq(2); + + // Trains the IDF Model. + IDFModel model = idf.fit(inputTable); + + // Uses the IDF Model for predictions. 
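+        // With m = 3 documents and per-dimension document frequencies [0, 3, 1, 2],
+        // idf = log((m + 1) / (d(t) + 1)) combined with minDocFreq = 2 is expected
+        // to leave only the last dimension with a non-zero idf of log(4.0 / 3.0).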
+ Table outputTable = model.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector inputValue = (DenseVector) row.getField(idf.getInputCol()); + DenseVector outputValue = (DenseVector) row.getField(idf.getOutputCol()); + System.out.printf("Input Value: %s\tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that trains an IDF model and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.idf import IDF +from pyflink.table import StreamTableEnvironment + +# Creates a new StreamExecutionEnvironment. +env = StreamExecutionEnvironment.get_execution_environment() + +# Creates a StreamTableEnvironment. +t_env = StreamTableEnvironment.create(env) + +# Generates input for training and prediction. +input_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(0, 1, 0, 2),), + (Vectors.dense(0, 1, 2, 3),), + (Vectors.dense(0, 1, 0, 0),), + ], + type_info=Types.ROW_NAMED( + ['input', ], + [DenseVectorTypeInfo(), ]))) + +# Creates an IDF object and initializes its parameters. +idf = IDF().set_min_doc_freq(2) + +# Trains the IDF Model. +model = idf.fit(input_table) + +# Uses the IDF Model for predictions. +output = model.transform(input_table)[0] + +# Extracts and displays the results. +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_index = field_names.index(idf.get_input_col()) + output_index = field_names.index(idf.get_output_col()) + print('Input Value: ' + str(result[input_index]) + + '\tOutput Value: ' + str(result[output_index])) +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/imputer.md b/docs/content.zh/docs/operators/feature/imputer.md new file mode 100644 index 000000000..327d31fb3 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/imputer.md @@ -0,0 +1,196 @@ +--- +title: "Imputer" +weight: 1 +type: docs +aliases: +- /operators/feature/imputer.html +--- + + + +## Imputer +The imputer for completing missing values of the input columns. + +Missing values can be imputed using the statistics(mean, median or +most frequent) of each column in which the missing values are located. +The input columns should be of numeric type. + +__Note__ The `mean`/`median`/`most frequent` value is computed after +filtering out missing values and null values, null values are always +treated as missing, and so are also imputed. + +__Note__ The parameter `relativeError` is only effective when the strategy + is `median`. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:--------|:------------------------| +| inputCols | Number | `null` | Features to be imputed. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:--------|:------------------| +| outputCols | Double | `null` | Imputed features. | + +### Parameters + +Below are the parameters required by `ImputerModel`. 
+ +| Key | Default | Type | Required | Description | +|:--------------|:-------------|:------------|:---------|:-------------------------------------------------------------------------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCols | `null` | String[] | yes | Output column names. | +| missingValue | `Double.NaN` | Double | no | The placeholder for the missing values. All occurrences of missing values will be imputed. | + +`Imputer` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|:--------------|:-------------|:------------|:---------|:------------------------------------------------------------------------------| +| strategy | `"mean"` | String | no | The imputation strategy. Supported values: 'mean', 'median', 'most_frequent'. | +| relativeError | `0.001` | Double | no | The relative target precision for the approximate quantile algorithm. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.imputer.Imputer; +import org.apache.flink.ml.feature.imputer.ImputerModel; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that trains a {@link Imputer} model and uses it for feature engineering. */ +public class ImputerExample { + + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. + DataStream trainStream = + env.fromElements( + Row.of(Double.NaN, 9.0), + Row.of(1.0, 9.0), + Row.of(1.5, 9.0), + Row.of(2.5, Double.NaN), + Row.of(5.0, 5.0), + Row.of(5.0, 4.0)); + Table trainTable = tEnv.fromDataStream(trainStream).as("input1", "input2"); + + // Creates an Imputer object and initialize its parameters + Imputer imputer = + new Imputer() + .setInputCols("input1", "input2") + .setOutputCols("output1", "output2") + .setStrategy("mean") + .setMissingValue(Double.NaN); + + // Trains the Imputer model. + ImputerModel model = imputer.fit(trainTable); + + // Uses the Imputer model for predictions. + Table outputTable = model.transform(trainTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + double[] inputValues = new double[imputer.getInputCols().length]; + double[] outputValues = new double[imputer.getInputCols().length]; + for (int i = 0; i < inputValues.length; i++) { + inputValues[i] = (double) row.getField(imputer.getInputCols()[i]); + outputValues[i] = (double) row.getField(imputer.getOutputCols()[i]); + } + System.out.printf( + "Input Values: %s\tOutput Values: %s\n", + Arrays.toString(inputValues), Arrays.toString(outputValues)); + } + } +} +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python + +# Simple program that creates an Imputer instance and uses it for feature +# engineering. 
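+#
+# With the 'mean' strategy, missing entries (both NaN and None) are replaced by
+# the mean of the observed values in their column: (1.0 + 1.5 + 1.5 + 4.0) / 4
+# = 2.0 for input1 and (9.0 + 9.0 + 7.0 + 5.0 + 4.0) / 5 = 6.8 for input2.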
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.imputer import Imputer +from pyflink.table import StreamTableEnvironment + +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input training and prediction data +train_data = t_env.from_data_stream( + env.from_collection([ + (float('NaN'), 9.0,), + (1.0, 9.0,), + (1.5, 7.0,), + (1.5, float('NaN'),), + (4.0, 5.0,), + (None, 4.0,), + ], + type_info=Types.ROW_NAMED( + ['input1', 'input2'], + [Types.DOUBLE(), Types.DOUBLE()]) + )) + +# Creates an Imputer object and initializes its parameters. +imputer = Imputer()\ + .set_input_cols('input1', 'input2')\ + .set_output_cols('output1', 'output2')\ + .set_strategy('mean')\ + .set_missing_value(float('NaN')) + +# Trains the Imputer Model. +model = imputer.fit(train_data) + +# Uses the Imputer Model for predictions. +output = model.transform(train_data)[0] + +# Extracts and displays the results. +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_values = [] + output_values = [] + for i in range(len(imputer.get_input_cols())): + input_values.append(result[field_names.index(imputer.get_input_cols()[i])]) + output_values.append(result[field_names.index(imputer.get_output_cols()[i])]) + print('Input Values: ' + str(input_values) + '\tOutput Values: ' + str(output_values)) +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/indextostring.md b/docs/content.zh/docs/operators/feature/indextostring.md new file mode 100644 index 000000000..80e341e13 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/indextostring.md @@ -0,0 +1,184 @@ +--- +title: "IndexToString" +weight: 1 +type: docs +aliases: +- /operators/feature/indextostring.html +--- + + + +## IndexToString + +`IndexToStringModel` transforms input index column(s) to string column(s) using +the model data computed by StringIndexer. It is a reverse operation of +StringIndexerModel. + +### Input Columns + +| Param name | Type | Default | Description | +| :--------- | :------ | :------ | :----------------------------------- | +| inputCols | Integer | `null` | Indices to be transformed to string. | + +### Output Columns + +| Param name | Type | Default | Description | +| :--------- | :----- | :------ | :------------------- | +| outputCols | String | `null` | Transformed strings. | + +### Parameters + +Below are the parameters required by `StringIndexerModel`. + +| Key | Default | Type | Required | Description | +| ---------- | ------- | ------ | -------- | -------------------- | +| inputCols | `null` | String | yes | Input column names. | +| outputCols | `null` | String | yes | Output column names. 
| + +### Examples + +{{< tabs index_to_string_examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.stringindexer.IndexToStringModel; +import org.apache.flink.ml.feature.stringindexer.StringIndexerModelData; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** + * Simple program that creates an IndexToStringModelExample instance and uses it for feature + * engineering. + */ +public class IndexToStringModelExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Creates model data for IndexToStringModel. + StringIndexerModelData modelData = + new StringIndexerModelData( + new String[][] {{"a", "b", "c", "d"}, {"-1.0", "0.0", "1.0", "2.0"}}); + Table modelTable = tEnv.fromDataStream(env.fromElements(modelData)).as("stringArrays"); + + // Generates input data. + DataStream predictStream = env.fromElements(Row.of(0, 3), Row.of(1, 2)); + Table predictTable = tEnv.fromDataStream(predictStream).as("inputCol1", "inputCol2"); + + // Creates an indexToStringModel object and initializes its parameters. + IndexToStringModel indexToStringModel = + new IndexToStringModel() + .setInputCols("inputCol1", "inputCol2") + .setOutputCols("outputCol1", "outputCol2") + .setModelData(modelTable); + + // Uses the indexToStringModel object for feature transformations. + Table outputTable = indexToStringModel.transform(predictTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + int[] inputValues = new int[indexToStringModel.getInputCols().length]; + String[] outputValues = new String[indexToStringModel.getInputCols().length]; + for (int i = 0; i < inputValues.length; i++) { + inputValues[i] = (int) row.getField(indexToStringModel.getInputCols()[i]); + outputValues[i] = (String) row.getField(indexToStringModel.getOutputCols()[i]); + } + + System.out.printf( + "Input Values: %s \tOutput Values: %s\n", + Arrays.toString(inputValues), Arrays.toString(outputValues)); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates an IndexToStringModelExample instance and uses it +# for feature engineering. 
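+#
+# The model data defines one vocabulary per input column, so for the first row
+# (0, 3), index 0 is looked up in ['a', 'b', 'c', 'd'] and maps back to 'a',
+# while index 3 maps to the last entry of the second vocabulary.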
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.stringindexer import IndexToStringModel +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +predict_table = t_env.from_data_stream( + env.from_collection([ + (0, 3), + (1, 2), + ], + type_info=Types.ROW_NAMED( + ['input_col1', 'input_col2'], + [Types.INT(), Types.INT()]) + )) + +# create an index-to-string model and initialize its parameters and model data +model_data_table = t_env.from_data_stream( + env.from_collection([ + ([['a', 'b', 'c', 'd'], [-1., 0., 1., 2.]],), + ], + type_info=Types.ROW_NAMED( + ['stringArrays'], + [Types.OBJECT_ARRAY(Types.OBJECT_ARRAY(Types.STRING()))]) + )) + +model = IndexToStringModel() \ + .set_input_cols('input_col1', 'input_col2') \ + .set_output_cols('output_col1', 'output_col2') \ + .set_model_data(model_data_table) + +# use the index-to-string model for feature engineering +output = model.transform(predict_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +input_values = [None for _ in model.get_input_cols()] +output_values = [None for _ in model.get_input_cols()] +for result in t_env.to_data_stream(output).execute_and_collect(): + for i in range(len(model.get_input_cols())): + input_values[i] = result[field_names.index(model.get_input_cols()[i])] + output_values[i] = result[field_names.index(model.get_output_cols()[i])] + print('Input Values: ' + str(input_values) + '\tOutput Values: ' + str(output_values)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/interaction.md b/docs/content.zh/docs/operators/feature/interaction.md new file mode 100644 index 000000000..1b9bca1bc --- /dev/null +++ b/docs/content.zh/docs/operators/feature/interaction.md @@ -0,0 +1,169 @@ +--- +title: "Interaction" +weight: 1 +type: docs +aliases: +- /operators/feature/interaction.html +--- + + + +## Interaction + +Interaction takes vector or numerical columns, and generates a single vector column that contains +the product of all combinations of one value from each input column. + +For example, when the input feature values are Double(2) and Vector(3, 4), the output would be +Vector(6, 8). When the input feature values are Vector(1, 2) and Vector(3, 4), the output would +be Vector(3, 4, 6, 8). If you change the position of these two input Vectors, the output would +be Vector(3, 6, 4, 8). + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:--------|:--------------------------| +| inputCols | Vector | `null` | Columns to be interacted. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:-------------------| +| outputCol | Vector | `"output"` | Interacted vector. | + +### Parameters + +| Key | Default | Type | Required | Description | +|-----------------|------------|-----------|----------|----------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCol | `"output"` | String | no | Output column name. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.interaction.Interaction; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates an Interaction instance and uses it for feature engineering. */ +public class InteractionExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(0, Vectors.dense(1.1, 3.2), Vectors.dense(2, 3)), + Row.of(1, Vectors.dense(2.1, 3.1), Vectors.dense(1, 3))); + + Table inputTable = tEnv.fromDataStream(inputStream).as("f0", "f1", "f2"); + + // Creates an Interaction object and initializes its parameters. + Interaction interaction = + new Interaction().setInputCols("f0", "f1", "f2").setOutputCol("outputVec"); + + // Transforms input data. + Table outputTable = interaction.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + Object[] inputValues = new Object[interaction.getInputCols().length]; + for (int i = 0; i < inputValues.length; i++) { + inputValues[i] = row.getField(interaction.getInputCols()[i]); + } + Vector outputValue = (Vector) row.getField(interaction.getOutputCol()); + System.out.printf( + "Input Values: %s \tOutput Value: %s\n", + Arrays.toString(inputValues), outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates an Interaction instance and uses it for feature +# engineering. 
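+#
+# Each output entry is the product of one value taken from every input column;
+# for the first row (f0 = 1, f1 = [1, 2], f2 = [3, 4]) the expected result is
+# [1*1*3, 1*1*4, 1*2*3, 1*2*4] = [3, 4, 6, 8].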
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.interaction import Interaction +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data_table = t_env.from_data_stream( + env.from_collection([ + (1, + Vectors.dense(1, 2), + Vectors.dense(3, 4)), + (2, + Vectors.dense(2, 8), + Vectors.dense(3, 4)) + ], + type_info=Types.ROW_NAMED( + ['f0', 'f1', 'f2'], + [Types.INT(), DenseVectorTypeInfo(), DenseVectorTypeInfo()]))) + +# create an interaction object and initialize its parameters +interaction = Interaction() \ + .set_input_cols('f0', 'f1', 'f2') \ + .set_output_col('interaction_vec') + +# use the interaction for feature engineering +output = interaction.transform(input_data_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +input_values = [None for _ in interaction.get_input_cols()] +for result in t_env.to_data_stream(output).execute_and_collect(): + for i in range(len(interaction.get_input_cols())): + input_values[i] = result[field_names.index(interaction.get_input_cols()[i])] + output_value = result[field_names.index(interaction.get_output_col())] + print('Input Values: ' + str(input_values) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/kbinsdiscretizer.md b/docs/content.zh/docs/operators/feature/kbinsdiscretizer.md new file mode 100644 index 000000000..b26e7804c --- /dev/null +++ b/docs/content.zh/docs/operators/feature/kbinsdiscretizer.md @@ -0,0 +1,185 @@ +--- +title: "KBinsDiscretizer" +weight: 1 +type: docs +aliases: +- /operators/feature/kbinsdiscretizer.html +--- + + + +## KBinsDiscretizer + +KBinsDiscretizer is an algorithm that implements discretization (also known as +quantization or binning) to transform continuous features into discrete ones. +The output values are in [0, numBins). + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:------------|:----------|:---------------------------| +| inputCol | DenseVector | `"input"` | Vectors to be discretized. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:------------|:-----------|:---------------------| +| outputCol | DenseVector | `"output"` | Discretized vectors. | + +### Parameters + +Below are the parameters required by `KBinsDiscretizerModel`. + +| Key | Default | Type | Required | Description | +|:----------|:-----------|:-------|:---------|:--------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | + +`KBinsDiscretizer` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|:-----------|:-------------|:--------|:---------|:-------------------------------------------------------------------------------------------------| +| strategy | `"quantile"` | String | no | Strategy used to define the width of the bin. Supported values: 'uniform', 'quantile', 'kmeans'. | +| numBins | `5` | Integer | no | Number of bins to produce. | +| subSamples | `200000` | Integer | no | Maximum number of samples used to fit the model. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.kbinsdiscretizer.KBinsDiscretizer; +import org.apache.flink.ml.feature.kbinsdiscretizer.KBinsDiscretizerModel; +import org.apache.flink.ml.feature.kbinsdiscretizer.KBinsDiscretizerParams; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a KBinsDiscretizer model and uses it for feature engineering. */ +public class KBinsDiscretizerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(Vectors.dense(1, 10, 0)), + Row.of(Vectors.dense(1, 10, 0)), + Row.of(Vectors.dense(1, 10, 0)), + Row.of(Vectors.dense(4, 10, 0)), + Row.of(Vectors.dense(5, 10, 0)), + Row.of(Vectors.dense(6, 10, 0)), + Row.of(Vectors.dense(7, 10, 0)), + Row.of(Vectors.dense(10, 10, 0)), + Row.of(Vectors.dense(13, 10, 3))); + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates a KBinsDiscretizer object and initializes its parameters. + KBinsDiscretizer kBinsDiscretizer = + new KBinsDiscretizer().setNumBins(3).setStrategy(KBinsDiscretizerParams.UNIFORM); + + // Trains the KBinsDiscretizer Model. + KBinsDiscretizerModel model = kBinsDiscretizer.fit(inputTable); + + // Uses the KBinsDiscretizer Model for predictions. + Table outputTable = model.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector inputValue = (DenseVector) row.getField(kBinsDiscretizer.getInputCol()); + DenseVector outputValue = (DenseVector) row.getField(kBinsDiscretizer.getOutputCol()); + System.out.printf("Input Value: %s\tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that trains a KBinsDiscretizer model and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.kbinsdiscretizer import KBinsDiscretizer +from pyflink.table import StreamTableEnvironment + +# Creates a new StreamExecutionEnvironment. +env = StreamExecutionEnvironment.get_execution_environment() + +# Creates a StreamTableEnvironment. +t_env = StreamTableEnvironment.create(env) + +# Generates input for training and prediction. +input_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(1, 10, 0),), + (Vectors.dense(1, 10, 0),), + (Vectors.dense(1, 10, 0),), + (Vectors.dense(4, 10, 0),), + (Vectors.dense(5, 10, 0),), + (Vectors.dense(6, 10, 0),), + (Vectors.dense(7, 10, 0),), + (Vectors.dense(10, 10, 0),), + (Vectors.dense(13, 10, 0),), + ], + type_info=Types.ROW_NAMED( + ['input', ], + [DenseVectorTypeInfo(), ]))) + +# Creates a KBinsDiscretizer object and initializes its parameters. 
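+# The 'uniform' strategy spaces bin edges evenly between the observed minimum
+# and maximum of each dimension; for the first dimension (min 1, max 13) with
+# three bins, the expected edges are 1, 5, 9, 13, so a value of 4 falls into
+# bin 0 and a value of 10 into bin 2.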
+k_bins_discretizer = KBinsDiscretizer() \ + .set_input_col('input') \ + .set_output_col('output') \ + .set_num_bins(3) \ + .set_strategy('uniform') + +# Trains the KBinsDiscretizer Model. +model = k_bins_discretizer.fit(input_table) + +# Uses the KBinsDiscretizer Model for predictions. +output = model.transform(input_table)[0] + +# Extracts and displays the results. +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + print('Input Value: ' + str(result[field_names.index(k_bins_discretizer.get_input_col())]) + + '\tOutput Value: ' + + str(result[field_names.index(k_bins_discretizer.get_output_col())])) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/maxabsscaler.md b/docs/content.zh/docs/operators/feature/maxabsscaler.md new file mode 100644 index 000000000..1c3d4fd91 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/maxabsscaler.md @@ -0,0 +1,180 @@ +--- +title: "MaxAbsScaler" +weight: 1 +type: docs +aliases: +- /operators/feature/maxabsscaler.html +--- + + + +## MaxAbsScaler + +MaxAbsScaler is an algorithm rescales feature values to the range [-1, 1] +by dividing through the largest maximum absolute value in each feature. +It does not shift/center the data and thus does not destroy any sparsity. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-----------------------| +| inputCol | Vector | `"input"` | Features to be scaled. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:-----------------| +| outputCol | Vector | `"output"` | Scaled features. | + +### Parameters + +| Key | Default | Type | Required | Description | +|-----------|------------|--------|----------|---------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.maxabsscaler.MaxAbsScaler; +import org.apache.flink.ml.feature.maxabsscaler.MaxAbsScalerModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a MaxAbsScaler model and uses it for feature engineering. */ +public class MaxAbsScalerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. 
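+        // The model keeps the maximum absolute value of each dimension seen during
+        // training (200 and 400 here) and divides prediction vectors by it, so
+        // [150.0, 90.0] is expected to scale to [0.75, 0.225].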
+ DataStream trainStream = + env.fromElements( + Row.of(Vectors.dense(0.0, 3.0)), + Row.of(Vectors.dense(2.1, 0.0)), + Row.of(Vectors.dense(4.1, 5.1)), + Row.of(Vectors.dense(6.1, 8.1)), + Row.of(Vectors.dense(200, 400))); + Table trainTable = tEnv.fromDataStream(trainStream).as("input"); + + DataStream predictStream = + env.fromElements( + Row.of(Vectors.dense(150.0, 90.0)), + Row.of(Vectors.dense(50.0, 40.0)), + Row.of(Vectors.dense(100.0, 50.0))); + Table predictTable = tEnv.fromDataStream(predictStream).as("input"); + + // Creates a MaxAbsScaler object and initializes its parameters. + MaxAbsScaler maxAbsScaler = new MaxAbsScaler(); + + // Trains the MaxAbsScaler Model. + MaxAbsScalerModel maxAbsScalerModel = maxAbsScaler.fit(trainTable); + + // Uses the MaxAbsScaler Model for predictions. + Table outputTable = maxAbsScalerModel.transform(predictTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector inputValue = (DenseVector) row.getField(maxAbsScaler.getInputCol()); + DenseVector outputValue = (DenseVector) row.getField(maxAbsScaler.getOutputCol()); + System.out.printf("Input Value: %-15s\tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that trains a MaxAbsScaler model and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.maxabsscaler import MaxAbsScaler +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input training and prediction data +train_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(0.0, 3.0),), + (Vectors.dense(2.1, 0.0),), + (Vectors.dense(4.1, 5.1),), + (Vectors.dense(6.1, 8.1),), + (Vectors.dense(200, 400),), + ], + type_info=Types.ROW_NAMED( + ['input'], + [DenseVectorTypeInfo()]) + )) + +predict_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(150.0, 90.0),), + (Vectors.dense(50.0, 40.0),), + (Vectors.dense(100.0, 50.0),), + ], + type_info=Types.ROW_NAMED( + ['input'], + [DenseVectorTypeInfo()]) + )) + +# create a maxabs scaler object and initialize its parameters +max_abs_scaler = MaxAbsScaler() + +# train the maxabs scaler model +model = max_abs_scaler.fit(train_data) + +# use the maxabs scaler model for predictions +output = model.transform(predict_data)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(max_abs_scaler.get_input_col())] + output_value = result[field_names.index(max_abs_scaler.get_output_col())] + print('Input Value: ' + str(input_value) + ' \tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/minhashlsh.md b/docs/content.zh/docs/operators/feature/minhashlsh.md new file mode 100644 index 000000000..22ce0b88f --- /dev/null +++ b/docs/content.zh/docs/operators/feature/minhashlsh.md @@ -0,0 +1,288 @@ +--- +title: "MinHashLSH" +weight: 1 +type: docs +aliases: +- /operators/feature/minhashlsh.html +--- + + + +## MinHashLSH + 
+MinHashLSH is a Locality Sensitive Hashing (LSH) scheme for Jaccard distance metric. +The input features are sets of natural numbers represented as non-zero indices of vectors, +either dense vectors or sparse vectors. Typically, sparse vectors are more efficient. + +In addition to transforming input feature vectors to multiple hash values, the MinHashLSH +model also supports approximate nearest neighbors search within a dataset regarding a key +vector and approximate similarity join between two datasets. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-----------------------| +| inputCol | Vector | `"input"` | Features to be mapped. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:--------------|:-----------|:-------------| +| outputCol | DenseVector[] | `"output"` | Hash values. | + +### Parameters + +Below are the parameters required by `MinHashLSHModel`. + +| Key | Default | Type | Required | Description | +|-------------------------|------------|---------|----------|--------------------------------------------------------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | + +`MinHashLSH` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|-------------------------|------------|---------|----------|--------------------------------------------------------------------| +| seed | `null` | Long | no | The random seed. | +| numHashTables | `1` | Integer | no | Default number of hash tables, for OR-amplification. | +| numHashFunctionPerTable | `1` | Integer | no | Default number of hash functions per table, for AND-amplification. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.ml.feature.lsh.MinHashLSH; +import org.apache.flink.ml.feature.lsh.MinHashLSHModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.SparseVector; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; + +import org.apache.commons.collections.IteratorUtils; + +import java.util.Arrays; +import java.util.List; + +import static org.apache.flink.table.api.Expressions.$; + +/** + * Simple program that trains a MinHashLSH model and uses it for approximate nearest neighbors and + * similarity join. + */ +public class MinHashLSHExample { + public static void main(String[] args) throws Exception { + + // Creates a new StreamExecutionEnvironment. + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + // Creates a StreamTableEnvironment. + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates two datasets. 
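+        // Each sparse vector encodes a set via its non-zero indices, e.g. the first
+        // row of dataA is the set {0, 1, 2}; the Jaccard distance between two sets
+        // is 1 - |A ∩ B| / |A ∪ B|.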
+        Table dataA =
+                tEnv.fromDataStream(
+                        env.fromCollection(
+                                Arrays.asList(
+                                        Row.of(
+                                                0,
+                                                Vectors.sparse(
+                                                        6,
+                                                        new int[] {0, 1, 2},
+                                                        new double[] {1., 1., 1.})),
+                                        Row.of(
+                                                1,
+                                                Vectors.sparse(
+                                                        6,
+                                                        new int[] {2, 3, 4},
+                                                        new double[] {1., 1., 1.})),
+                                        Row.of(
+                                                2,
+                                                Vectors.sparse(
+                                                        6,
+                                                        new int[] {0, 2, 4},
+                                                        new double[] {1., 1., 1.}))),
+                                Types.ROW_NAMED(
+                                        new String[] {"id", "vec"},
+                                        Types.INT,
+                                        TypeInformation.of(SparseVector.class))));
+
+        Table dataB =
+                tEnv.fromDataStream(
+                        env.fromCollection(
+                                Arrays.asList(
+                                        Row.of(
+                                                3,
+                                                Vectors.sparse(
+                                                        6,
+                                                        new int[] {1, 3, 5},
+                                                        new double[] {1., 1., 1.})),
+                                        Row.of(
+                                                4,
+                                                Vectors.sparse(
+                                                        6,
+                                                        new int[] {2, 3, 5},
+                                                        new double[] {1., 1., 1.})),
+                                        Row.of(
+                                                5,
+                                                Vectors.sparse(
+                                                        6,
+                                                        new int[] {1, 2, 4},
+                                                        new double[] {1., 1., 1.}))),
+                                Types.ROW_NAMED(
+                                        new String[] {"id", "vec"},
+                                        Types.INT,
+                                        TypeInformation.of(SparseVector.class))));
+
+        // Creates a MinHashLSH estimator object and initializes its parameters.
+        MinHashLSH lsh =
+                new MinHashLSH()
+                        .setInputCol("vec")
+                        .setOutputCol("hashes")
+                        .setSeed(2022)
+                        .setNumHashTables(5);
+
+        // Trains the MinHashLSH model.
+        MinHashLSHModel model = lsh.fit(dataA);
+
+        // Uses the MinHashLSH model for transformation.
+        Table output = model.transform(dataA)[0];
+
+        // Extracts and displays the results.
+        List<String> fieldNames = output.getResolvedSchema().getColumnNames();
+        for (Row result :
+                (List<Row>) IteratorUtils.toList(tEnv.toDataStream(output).executeAndCollect())) {
+            Vector inputValue = result.getFieldAs(fieldNames.indexOf(lsh.getInputCol()));
+            DenseVector[] outputValue = result.getFieldAs(fieldNames.indexOf(lsh.getOutputCol()));
+            System.out.printf(
+                    "Vector: %s \tHash values: %s\n", inputValue, Arrays.toString(outputValue));
+        }
+
+        // Finds approximate nearest neighbors of the key.
+        Vector key = Vectors.sparse(6, new int[] {1, 3}, new double[] {1., 1.});
+        output = model.approxNearestNeighbors(dataA, key, 2).select($("id"), $("distCol"));
+        for (Row result :
+                (List<Row>) IteratorUtils.toList(tEnv.toDataStream(output).executeAndCollect())) {
+            int idValue = result.getFieldAs(fieldNames.indexOf("id"));
+            double distValue = result.getFieldAs(result.getArity() - 1);
+            System.out.printf("ID: %d \tDistance: %f\n", idValue, distValue);
+        }
+
+        // Approximately finds pairs from two datasets with distances smaller than the threshold.
+        output = model.approxSimilarityJoin(dataA, dataB, .6, "id");
+        for (Row result :
+                (List<Row>) IteratorUtils.toList(tEnv.toDataStream(output).executeAndCollect())) {
+            int idAValue = result.getFieldAs(0);
+            int idBValue = result.getFieldAs(1);
+            double distValue = result.getFieldAs(2);
+            System.out.printf(
+                    "ID from left: %d \tID from right: %d \t Distance: %f\n",
+                    idAValue, idBValue, distValue);
+        }
+    }
+}
+
+```
+
+{{< /tab>}}
+
+{{< tab "Python">}}
+
+```python
+# Simple program that trains a MinHashLSH model and uses it for approximate nearest neighbors
+# and similarity join.
+
+from pyflink.common import Types
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.table import StreamTableEnvironment
+from pyflink.table.expressions import col
+
+from pyflink.ml.linalg import Vectors, SparseVectorTypeInfo
+from pyflink.ml.feature.lsh import MinHashLSH
+
+# Creates a new StreamExecutionEnvironment.
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# Creates a StreamTableEnvironment.
+t_env = StreamTableEnvironment.create(env)
+
+# Generates two datasets.
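+# Each sparse vector encodes a set as its non-zero indices, e.g. the first row of
+# data_a represents the set {0, 1, 2} drawn from a universe of 6 elements.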
+data_a = t_env.from_data_stream(
+    env.from_collection([
+        (0, Vectors.sparse(6, [0, 1, 2], [1., 1., 1.])),
+        (1, Vectors.sparse(6, [2, 3, 4], [1., 1., 1.])),
+        (2, Vectors.sparse(6, [0, 2, 4], [1., 1., 1.])),
+    ], type_info=Types.ROW_NAMED(['id', 'vec'], [Types.INT(), SparseVectorTypeInfo()])))
+
+data_b = t_env.from_data_stream(
+    env.from_collection([
+        (3, Vectors.sparse(6, [1, 3, 5], [1., 1., 1.])),
+        (4, Vectors.sparse(6, [2, 3, 5], [1., 1., 1.])),
+        (5, Vectors.sparse(6, [1, 2, 4], [1., 1., 1.])),
+    ], type_info=Types.ROW_NAMED(['id', 'vec'], [Types.INT(), SparseVectorTypeInfo()])))
+
+# Creates a MinHashLSH estimator object and initializes its parameters.
+lsh = MinHashLSH() \
+    .set_input_col('vec') \
+    .set_output_col('hashes') \
+    .set_seed(2022) \
+    .set_num_hash_tables(5)
+
+# Trains the MinHashLSH model.
+model = lsh.fit(data_a)
+
+# Uses the MinHashLSH model for transformation.
+output = model.transform(data_a)[0]
+
+# Extracts and displays the results.
+field_names = output.get_schema().get_field_names()
+for result in t_env.to_data_stream(output).execute_and_collect():
+    input_value = result[field_names.index(lsh.get_input_col())]
+    output_value = result[field_names.index(lsh.get_output_col())]
+    print(f'Vector: {input_value} \tHash Values: {output_value}')
+
+# Finds approximate nearest neighbors of the key.
+key = Vectors.sparse(6, [1, 3], [1., 1.])
+output = model.approx_nearest_neighbors(data_a, key, 2).select(col("id"), col("distCol"))
+for result in t_env.to_data_stream(output).execute_and_collect():
+    id_value = result[field_names.index("id")]
+    dist_value = result[-1]
+    print(f'ID: {id_value} \tDistance: {dist_value}')
+
+# Approximately finds pairs from two datasets with distances smaller than the threshold.
+output = model.approx_similarity_join(data_a, data_b, .6, "id")
+for result in t_env.to_data_stream(output).execute_and_collect():
+    id_a_value, id_b_value, dist_value = result
+    print(f'ID from left: {id_a_value} \tID from right: {id_b_value} \t Distance: {dist_value}')
+
+```
+
+{{< /tab>}}
+
+{{< /tabs>}}
diff --git a/docs/content.zh/docs/operators/feature/minmaxscaler.md b/docs/content.zh/docs/operators/feature/minmaxscaler.md
new file mode 100644
index 000000000..b4427b629
--- /dev/null
+++ b/docs/content.zh/docs/operators/feature/minmaxscaler.md
@@ -0,0 +1,180 @@
+---
+title: "MinMaxScaler"
+weight: 1
+type: docs
+aliases:
+- /operators/feature/minmaxscaler.html
+---
+
+
+
+## MinMaxScaler
+
+MinMaxScaler is an algorithm that rescales feature values to a common range
+[min, max] defined by the user.
+
+### Input Columns
+
+| Param name | Type   | Default   | Description            |
+|:-----------|:-------|:----------|:-----------------------|
+| inputCol   | Vector | `"input"` | Features to be scaled. |
+
+### Output Columns
+
+| Param name | Type   | Default    | Description      |
+|:-----------|:-------|:-----------|:-----------------|
+| outputCol  | Vector | `"output"` | Scaled features. |
+
+### Parameters
+
+| Key       | Default    | Type   | Required | Description                               |
+|-----------|------------|--------|----------|-------------------------------------------|
+| inputCol  | `"input"`  | String | no       | Input column name.                        |
+| outputCol | `"output"` | String | no       | Output column name.                       |
+| min       | `0.0`      | Double | no       | Lower bound of the output feature range.  |
+| max       | `1.0`      | Double | no       | Upper bound of the output feature range.  |
+
+### Examples
+
+{{< tabs examples >}}
+
+{{< tab "Java">}}
+
+```java
+import org.apache.flink.ml.feature.minmaxscaler.MinMaxScaler;
+import org.apache.flink.ml.feature.minmaxscaler.MinMaxScalerModel;
+import org.apache.flink.ml.linalg.DenseVector;
+import org.apache.flink.ml.linalg.Vectors;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+
+/** Simple program that trains a MinMaxScaler model and uses it for feature engineering. */
+public class MinMaxScalerExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        // Generates input training and prediction data.
+        DataStream<Row> trainStream =
+                env.fromElements(
+                        Row.of(Vectors.dense(0.0, 3.0)),
+                        Row.of(Vectors.dense(2.1, 0.0)),
+                        Row.of(Vectors.dense(4.1, 5.1)),
+                        Row.of(Vectors.dense(6.1, 8.1)),
+                        Row.of(Vectors.dense(200, 400)));
+        Table trainTable = tEnv.fromDataStream(trainStream).as("input");
+
+        DataStream<Row> predictStream =
+                env.fromElements(
+                        Row.of(Vectors.dense(150.0, 90.0)),
+                        Row.of(Vectors.dense(50.0, 40.0)),
+                        Row.of(Vectors.dense(100.0, 50.0)));
+        Table predictTable = tEnv.fromDataStream(predictStream).as("input");
+
+        // Creates a MinMaxScaler object and initializes its parameters.
+        MinMaxScaler minMaxScaler = new MinMaxScaler();
+
+        // Trains the MinMaxScaler Model.
+        MinMaxScalerModel minMaxScalerModel = minMaxScaler.fit(trainTable);
+
+        // Uses the MinMaxScaler Model for predictions.
+        Table outputTable = minMaxScalerModel.transform(predictTable)[0];
+
+        // Extracts and displays the results.
+        for (CloseableIterator<Row> it = outputTable.execute().collect(); it.hasNext(); ) {
+            Row row = it.next();
+            DenseVector inputValue = (DenseVector) row.getField(minMaxScaler.getInputCol());
+            DenseVector outputValue = (DenseVector) row.getField(minMaxScaler.getOutputCol());
+            System.out.printf("Input Value: %-15s\tOutput Value: %s\n", inputValue, outputValue);
+        }
+    }
+}
+
+```
+
+{{< /tab>}}
+
+{{< tab "Python">}}
+
+```python
+# Simple program that trains a MinMaxScaler model and uses it for feature
+# engineering.
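+#
+# MinMaxScaler rescales each feature dimension x to
+#   x' = (x - E_min) / (E_max - E_min) * (max - min) + min,
+# where E_min and E_max are the per-dimension extrema observed during fit().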
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.minmaxscaler import MinMaxScaler +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input training and prediction data +train_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(0.0, 3.0),), + (Vectors.dense(2.1, 0.0),), + (Vectors.dense(4.1, 5.1),), + (Vectors.dense(6.1, 8.1),), + (Vectors.dense(200, 400),), + ], + type_info=Types.ROW_NAMED( + ['input'], + [DenseVectorTypeInfo()]) + )) + +predict_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(150.0, 90.0),), + (Vectors.dense(50.0, 40.0),), + (Vectors.dense(100.0, 50.0),), + ], + type_info=Types.ROW_NAMED( + ['input'], + [DenseVectorTypeInfo()]) + )) + +# create a min-max-scaler object and initialize its parameters +min_max_scaler = MinMaxScaler() + +# train the min-max-scaler model +model = min_max_scaler.fit(train_data) + +# use the min-max-scaler model for predictions +output = model.transform(predict_data)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(min_max_scaler.get_input_col())] + output_value = result[field_names.index(min_max_scaler.get_output_col())] + print('Input Value: ' + str(input_value) + ' \tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/ngram.md b/docs/content.zh/docs/operators/feature/ngram.md new file mode 100644 index 000000000..7544d617c --- /dev/null +++ b/docs/content.zh/docs/operators/feature/ngram.md @@ -0,0 +1,155 @@ +--- +title: "NGram" +weight: 1 +type: docs +aliases: +- /operators/feature/ngram.html +--- + + + +## NGram +NGram converts the input string array into an array of n-grams, +where each n-gram is represented by a space-separated string of +words. If the length of the input array is less than `n`, no +n-grams are returned. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:---------|:----------|:--------------------| +| inputCol | String[] | `"input"` | Input string array. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:---------|:-----------|:------------| +| outputCol | String[] | `"output"` | N-grams. | + +### Parameters + +| Key | Default | Type | Required | Description | +|:----------|:-----------|:--------|:---------|:-------------------------------------| +| n | `2` | Integer | no | Number of elements per n-gram (>=1). | +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.ngram.NGram; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates an NGram instance and uses it for feature engineering. */ +public class NGramExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of((Object) new String[0]), + Row.of((Object) new String[] {"a", "b", "c"}), + Row.of((Object) new String[] {"a", "b", "c", "d"})); + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates an NGram object and initializes its parameters. + NGram nGram = new NGram().setN(2).setInputCol("input").setOutputCol("output"); + + // Uses the NGram object for feature transformations. + Table outputTable = nGram.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + String[] inputValue = (String[]) row.getField(nGram.getInputCol()); + String[] outputValue = (String[]) row.getField(nGram.getOutputCol()); + + System.out.printf( + "Input Value: %s \tOutput Value: %s\n", + Arrays.toString(inputValue), Arrays.toString(outputValue)); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates an NGram instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.ngram import NGram +from pyflink.table import StreamTableEnvironment + +env = StreamExecutionEnvironment.get_execution_environment() + +t_env = StreamTableEnvironment.create(env) + +# Generates input data. +input_data_table = t_env.from_data_stream( + env.from_collection([ + ([],), + (['a', 'b', 'c'],), + (['a', 'b', 'c', 'd'],), + ], + type_info=Types.ROW_NAMED( + ["input", ], + [Types.OBJECT_ARRAY(Types.STRING())]))) + +# Creates an NGram object and initializes its parameters. +n_gram = NGram() \ + .set_input_col('input') \ + .set_n(2) \ + .set_output_col('output') + +# Uses the NGram object for feature transformations. +output = n_gram.transform(input_data_table)[0] + +# Extracts and displays the results. 
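+# For the input ['a', 'b', 'c'] with n=2, the expected n-grams are
+# ['a b', 'b c']; the empty input row yields an empty array.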
+field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(n_gram.get_input_col())] + output_value = result[field_names.index(n_gram.get_output_col())] + print('Input Value: ' + ' '.join(input_value) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/normalizer.md b/docs/content.zh/docs/operators/feature/normalizer.md new file mode 100644 index 000000000..65e3760a4 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/normalizer.md @@ -0,0 +1,154 @@ +--- +title: "Normalizer" +weight: 1 +type: docs +aliases: +- /operators/feature/normalizer.html +--- + + + +## Normalizer + +A Transformer that normalizes a vector to have unit norm using the given p-norm. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:--------------------------| +| inputCol | Vector | `"input"` | Vectors to be normalized. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:--------------------| +| outputCol | Vector | `"output"` | Normalized vectors. | + +### Parameters + +| Key | Default | Type | Required | Description | +|:----------|:-----------|:-------|:---------|:--------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| p | `2.0` | Double | no | The p norm value. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.normalizer.Normalizer; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that creates a Normalizer instance and uses it for feature engineering. */ +public class NormalizerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(Vectors.dense(2.1, 3.1, 1.2, 3.1, 4.6)), + Row.of(Vectors.dense(1.2, 3.1, 4.6, 2.1, 3.1))); + Table inputTable = tEnv.fromDataStream(inputStream).as("inputVec"); + + // Creates a Normalizer object and initializes its parameters. + Normalizer normalizer = + new Normalizer().setInputCol("inputVec").setP(3.0).setOutputCol("outputVec"); + + // Uses the Normalizer object for feature transformations. + Table outputTable = normalizer.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + Vector inputValue = (Vector) row.getField(normalizer.getInputCol()); + + Vector outputValue = (Vector) row.getField(normalizer.getOutputCol()); + + System.out.printf("Input Value: %s \tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a Normalizer instance and uses it for feature +# engineering. 
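+#
+# The p-norm of a vector x is (sum_i |x_i|^p)^(1/p); the Normalizer divides each
+# vector by its p-norm (p=1.5 below) so that the output has unit p-norm.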
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.normalizer import Normalizer +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data_table = t_env.from_data_stream( + env.from_collection([ + (1, Vectors.dense(2.1, 3.1, 1.2, 2.1)), + (2, Vectors.dense(2.3, 2.1, 1.3, 1.2)), + ], + type_info=Types.ROW_NAMED( + ['id', 'input_vec'], + [Types.INT(), DenseVectorTypeInfo()]))) + +# create a normalizer object and initialize its parameters +normalizer = Normalizer() \ + .set_input_col('input_vec') \ + .set_p(1.5) \ + .set_output_col('output_vec') + +# use the normalizer model for feature engineering +output = normalizer.transform(input_data_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(normalizer.get_input_col())] + output_value = result[field_names.index(normalizer.get_output_col())] + print('Input Value: ' + str(input_value) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/onehotencoder.md b/docs/content.zh/docs/operators/feature/onehotencoder.md new file mode 100644 index 000000000..32ff2ffa0 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/onehotencoder.md @@ -0,0 +1,161 @@ +--- +title: "OneHotEncoder" +weight: 1 +type: docs +aliases: +- /operators/feature/onehotencoder.html +--- + + +## OneHotEncoder + +OneHotEncoder maps a categorical feature, represented as a label index, to a +binary vector with at most a single one-value indicating the presence of a +specific feature value from among the set of all feature values. This encoding +allows algorithms that expect continuous features, such as Logistic Regression, +to use categorical features. + +OneHotEncoder can transform multiple columns, returning a one-hot-encoded output +vector column for each input column. + +### Input Columns + +| Param name | Type | Default | Description | +| :--------- | :------ | :------ |:-------------| +| inputCols | Integer | `null` | Label index. | + +### Output Columns + +| Param name | Type | Default | Description | +| :--------- | :----- | :------ |:-----------------------| +| outputCols | Vector | `null` | Encoded binary vector. | + +### Parameters + +| Key | Default | Type | Required | Description | +|---------------|-----------|----------|----------|--------------------------------------------------------------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCols | `null` | String[] | yes | Output column names. | +| handleInvalid | `"error"` | String | no | Strategy to handle invalid entries. Supported values: 'error', 'skip', 'keep'. | +| dropLast | `true` | Boolean | no | Whether to drop the last category. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} +```java +import org.apache.flink.ml.feature.onehotencoder.OneHotEncoder; +import org.apache.flink.ml.feature.onehotencoder.OneHotEncoderModel; +import org.apache.flink.ml.linalg.SparseVector; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a OneHotEncoder model and uses it for feature engineering. */ +public class OneHotEncoderExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. + DataStream trainStream = + env.fromElements(Row.of(0.0), Row.of(1.0), Row.of(2.0), Row.of(0.0)); + Table trainTable = tEnv.fromDataStream(trainStream).as("input"); + + DataStream predictStream = env.fromElements(Row.of(0.0), Row.of(1.0), Row.of(2.0)); + Table predictTable = tEnv.fromDataStream(predictStream).as("input"); + + // Creates a OneHotEncoder object and initializes its parameters. + OneHotEncoder oneHotEncoder = + new OneHotEncoder().setInputCols("input").setOutputCols("output"); + + // Trains the OneHotEncoder Model. + OneHotEncoderModel model = oneHotEncoder.fit(trainTable); + + // Uses the OneHotEncoder Model for predictions. + Table outputTable = model.transform(predictTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + Double inputValue = (Double) row.getField(oneHotEncoder.getInputCols()[0]); + SparseVector outputValue = + (SparseVector) row.getField(oneHotEncoder.getOutputCols()[0]); + System.out.printf("Input Value: %s\tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` +{{< /tab>}} + +{{< tab "Python">}} +```python +# Simple program that trains a OneHotEncoder model and uses it for feature +# engineering. 
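+#
+# With the default dropLast=true (see the parameter table above), a feature with
+# the three training values {0.0, 1.0, 2.0} is encoded into a 2-dimensional
+# vector: 0.0 -> (1, 0), 1.0 -> (0, 1), and 2.0 -> (0, 0).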
+
+from pyflink.common import Row
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.ml.feature.onehotencoder import OneHotEncoder
+from pyflink.table import StreamTableEnvironment, DataTypes
+
+# create a new StreamExecutionEnvironment
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# create a StreamTableEnvironment
+t_env = StreamTableEnvironment.create(env)
+
+# generate input training and prediction data
+train_table = t_env.from_elements(
+    [Row(0.0), Row(1.0), Row(2.0), Row(0.0)],
+    DataTypes.ROW([
+        DataTypes.FIELD('input', DataTypes.DOUBLE())
+    ]))
+
+predict_table = t_env.from_elements(
+    [Row(0.0), Row(1.0), Row(2.0)],
+    DataTypes.ROW([
+        DataTypes.FIELD('input', DataTypes.DOUBLE())
+    ]))
+
+# create a one-hot-encoder object and initialize its parameters
+one_hot_encoder = OneHotEncoder().set_input_cols('input').set_output_cols('output')
+
+# train the one-hot-encoder model
+model = one_hot_encoder.fit(train_table)
+
+# use the one-hot-encoder model for predictions
+output = model.transform(predict_table)[0]
+
+# extract and display the results
+field_names = output.get_schema().get_field_names()
+for result in t_env.to_data_stream(output).execute_and_collect():
+    input_value = result[field_names.index(one_hot_encoder.get_input_cols()[0])]
+    output_value = result[field_names.index(one_hot_encoder.get_output_cols()[0])]
+    print('Input Value: ' + str(input_value) + ' \tOutput Value: ' + str(output_value))
+
+```
+{{< /tab>}}
+
+{{< /tabs>}}
diff --git a/docs/content.zh/docs/operators/feature/onlinestandardscaler.md b/docs/content.zh/docs/operators/feature/onlinestandardscaler.md
new file mode 100644
index 000000000..ef998f1f8
--- /dev/null
+++ b/docs/content.zh/docs/operators/feature/onlinestandardscaler.md
@@ -0,0 +1,259 @@
+---
+title: "OnlineStandardScaler"
+weight: 1
+type: docs
+aliases:
+- /operators/feature/onlinestandardscaler.html
+---
+
+
+
+## OnlineStandardScaler
+
+An Estimator which implements the online standard scaling algorithm, the online
+version of StandardScaler.
+
+OnlineStandardScaler splits the input data according to the user-specified window
+strategy. For each window, it computes the mean and standard deviation using the
+data seen so far (i.e., not only the data in the current window, but also the
+historical data). The model data generated by OnlineStandardScaler is a model
+stream: there is one piece of model data for each window.
+
+During the inference phase (i.e., when using OnlineStandardScalerModel for
+prediction), users can output the version of the model that was used to predict
+each data point. Moreover,
+- When the training data and test data both contain event time, users can
+specify the maximum difference between the timestamps of the input and model data,
+which enforces the use of a relatively fresh model for prediction.
+- Otherwise, the prediction process always uses the current model data for prediction.
+
+
+### Input Columns
+
+| Param name | Type   | Default   | Description            |
+|:-----------|:-------|:----------|:-----------------------|
+| inputCol   | Vector | `"input"` | Features to be scaled. |
+
+### Output Columns
+
+| Param name      | Type   | Default     | Description                                                                                                                                         |
+|:----------------|:-------|:------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------|
+| outputCol       | Vector | `"output"`  | Scaled features.                                                                                                                                    |
+| modelVersionCol | String | `"version"` | The name of the column which contains the version of the model data that the input data is predicted with. The version should be a 64-bit integer. |
+
+### Parameters
+
+Below are the parameters required by `OnlineStandardScalerModel`.
+
+| Key                    | Default     | Type    | Required | Description                                                                                                                                                                                         |
+|------------------------|-------------|---------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| inputCol               | `"input"`   | String  | no       | Input column name.                                                                                                                                                                                  |
+| outputCol              | `"output"`  | String  | no       | Output column name.                                                                                                                                                                                 |
+| withMean               | `false`     | Boolean | no       | Whether to center the data with mean before scaling.                                                                                                                                               |
+| withStd                | `true`      | Boolean | no       | Whether to scale the data with standard deviation.                                                                                                                                                 |
+| modelVersionCol        | `"version"` | String  | no       | The name of the column which contains the version of the model data that the input data is predicted with. The version should be a 64-bit integer.                                                 |
+| maxAllowedModelDelayMs | `0L`        | Long    | no       | The maximum difference allowed between the timestamps of the input record and the model data that is used to predict that input record. This param only works when the input contains event time. |
+
+`OnlineStandardScaler` needs the parameters above and also the parameters below.
+
+| Key     | Default                       | Type    | Required | Description                                                                     |
+|---------|-------------------------------|---------|----------|---------------------------------------------------------------------------------|
+| windows | `GlobalWindows.getInstance()` | Windows | no       | Windowing strategy that determines how to create mini-batches from input data. |
+
+
+### Examples
+
+{{< tabs examples >}}
+
+{{< tab "Java">}}
+
+```java
+import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
+import org.apache.flink.api.common.time.Time;
+import org.apache.flink.ml.common.window.EventTimeTumblingWindows;
+import org.apache.flink.ml.feature.standardscaler.OnlineStandardScaler;
+import org.apache.flink.ml.feature.standardscaler.OnlineStandardScalerModel;
+import org.apache.flink.ml.linalg.DenseVector;
+import org.apache.flink.ml.linalg.Vectors;
+import org.apache.flink.ml.linalg.typeinfo.DenseVectorTypeInfo;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.DataTypes;
+import org.apache.flink.table.api.Schema;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+
+import java.util.Arrays;
+import java.util.List;
+
+/** Simple program that trains an OnlineStandardScaler model and uses it for feature engineering. */
+public class OnlineStandardScalerExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        // Generates input data.
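+        // The first field is an event-time timestamp in milliseconds. With 3000 ms
+        // tumbling windows, the rows at 0-2000 form the first window, 6000-8000 the
+        // second, and 9000-11000 the third; one model version is emitted per window.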
+        List<Row> inputData =
+                Arrays.asList(
+                        Row.of(0L, Vectors.dense(-2.5, 9, 1)),
+                        Row.of(1000L, Vectors.dense(1.4, -5, 1)),
+                        Row.of(2000L, Vectors.dense(2, -1, -2)),
+                        Row.of(6000L, Vectors.dense(0.7, 3, 1)),
+                        Row.of(7000L, Vectors.dense(0, 1, 1)),
+                        Row.of(8000L, Vectors.dense(0.5, 0, -2)),
+                        Row.of(9000L, Vectors.dense(0.4, 1, 1)),
+                        Row.of(10000L, Vectors.dense(0.3, 2, 1)),
+                        Row.of(11000L, Vectors.dense(0.5, 1, -2)));
+
+        DataStream<Row> inputStream = env.fromCollection(inputData);
+
+        DataStream<Row> inputStreamWithEventTime =
+                inputStream.assignTimestampsAndWatermarks(
+                        WatermarkStrategy.<Row>forMonotonousTimestamps()
+                                .withTimestampAssigner(
+                                        (SerializableTimestampAssigner<Row>)
+                                                (element, recordTimestamp) ->
+                                                        element.getFieldAs(0)));
+
+        Table inputTable =
+                tEnv.fromDataStream(
+                                inputStreamWithEventTime,
+                                Schema.newBuilder()
+                                        .column("f0", DataTypes.BIGINT())
+                                        .column("f1", DataTypes.RAW(DenseVectorTypeInfo.INSTANCE))
+                                        .columnByMetadata("rowtime", "TIMESTAMP_LTZ(3)")
+                                        .watermark("rowtime", "SOURCE_WATERMARK()")
+                                        .build())
+                        .as("id", "input");
+
+        // Creates an OnlineStandardScaler object and initializes its parameters.
+        long windowSizeMs = 3000;
+        OnlineStandardScaler onlineStandardScaler =
+                new OnlineStandardScaler()
+                        .setWindows(EventTimeTumblingWindows.of(Time.milliseconds(windowSizeMs)));
+
+        // Trains the OnlineStandardScaler Model.
+        OnlineStandardScalerModel model = onlineStandardScaler.fit(inputTable);
+
+        // Uses the OnlineStandardScaler Model for predictions.
+        Table outputTable = model.transform(inputTable)[0];
+
+        // Extracts and displays the results.
+        for (CloseableIterator<Row> it = outputTable.execute().collect(); it.hasNext(); ) {
+            Row row = it.next();
+            DenseVector inputValue = (DenseVector) row.getField(onlineStandardScaler.getInputCol());
+            DenseVector outputValue =
+                    (DenseVector) row.getField(onlineStandardScaler.getOutputCol());
+            long modelVersion = row.getFieldAs(onlineStandardScaler.getModelVersionCol());
+            System.out.printf(
+                    "Input Value: %s\tOutput Value: %s\tModel Version: %s\n",
+                    inputValue, outputValue, modelVersion);
+        }
+    }
+}
+
+```
+
+{{< /tab>}}
+
+{{< tab "Python">}}
+
+```python
+# Simple program that trains an OnlineStandardScaler model and uses it for feature
+# engineering.
+
+from pyflink.common import Types
+from pyflink.common.time import Time, Instant
+from pyflink.java_gateway import get_gateway
+from pyflink.table import Schema
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.table import StreamTableEnvironment
+from pyflink.table.expressions import col
+
+from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo
+from pyflink.ml.feature.onlinestandardscaler import OnlineStandardScaler
+from pyflink.ml.common.window import EventTimeTumblingWindows
+
+# Creates a new StreamExecutionEnvironment.
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# Creates a StreamTableEnvironment.
+t_env = StreamTableEnvironment.create(env)
+
+# Generates input data.
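+# The DenseVector column has no predefined SQL type, so it is declared as a RAW
+# type whose serializer string is obtained from the Java DenseVectorSerializer below.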
+dense_vector_serializer = get_gateway().jvm.org.apache.flink.table.types.logical.RawType(
+    get_gateway().jvm.org.apache.flink.ml.linalg.DenseVector(0).getClass(),
+    get_gateway().jvm.org.apache.flink.ml.linalg.typeinfo.DenseVectorSerializer()
+).getSerializerString()
+
+schema = Schema.new_builder() \
+    .column("ts", "TIMESTAMP_LTZ(3)") \
+    .column("input", "RAW('org.apache.flink.ml.linalg.DenseVector', '{serializer}')"
+            .format(serializer=dense_vector_serializer)) \
+    .watermark("ts", "ts - INTERVAL '1' SECOND") \
+    .build()
+
+input_data = t_env.from_data_stream(
+    env.from_collection([
+        (Instant.of_epoch_milli(0), Vectors.dense(-2.5, 9, 1),),
+        (Instant.of_epoch_milli(1000), Vectors.dense(1.4, -5, 1),),
+        (Instant.of_epoch_milli(2000), Vectors.dense(2, -1, -2),),
+        (Instant.of_epoch_milli(6000), Vectors.dense(0.7, 3, 1),),
+        (Instant.of_epoch_milli(7000), Vectors.dense(0, 1, 1),),
+        (Instant.of_epoch_milli(8000), Vectors.dense(0.5, 0, -2),),
+        (Instant.of_epoch_milli(9000), Vectors.dense(0.4, 1, 1),),
+        (Instant.of_epoch_milli(10000), Vectors.dense(0.3, 2, 1),),
+        (Instant.of_epoch_milli(11000), Vectors.dense(0.5, 1, -2),)
+    ],
+        type_info=Types.ROW_NAMED(
+            ['ts', 'input'],
+            [Types.INSTANT(), DenseVectorTypeInfo()])),
+    schema)
+
+# Creates an online standard-scaler object and initializes its parameters.
+standard_scaler = OnlineStandardScaler() \
+    .set_windows(EventTimeTumblingWindows.of(Time.milliseconds(3000))) \
+    .set_max_allowed_model_delay_ms(0)
+
+# Trains the online standard-scaler model.
+model = standard_scaler.fit(input_data)
+
+# Uses the standard-scaler model for predictions.
+output = model.transform(input_data)[0]
+
+# Extracts and displays the results.
+output = output.select(col("input"), col("output"), col("version"))
+field_names = output.get_schema().get_field_names()
+
+for result in t_env.to_data_stream(output).execute_and_collect():
+    input_value = result[field_names.index(standard_scaler.get_input_col())]
+    output_value = result[field_names.index(standard_scaler.get_output_col())]
+    model_version = result[field_names.index(standard_scaler.get_model_version_col())]
+    print('Input Value: ' + str(input_value) + ' \tOutput Value: ' + str(output_value) +
+          '\tModel Version: ' + str(model_version))
+
+```
+
+{{< /tab>}}
+
+{{< /tabs>}}
\ No newline at end of file
diff --git a/docs/content.zh/docs/operators/feature/polynomialexpansion.md b/docs/content.zh/docs/operators/feature/polynomialexpansion.md
new file mode 100644
index 000000000..a7f5ee899
--- /dev/null
+++ b/docs/content.zh/docs/operators/feature/polynomialexpansion.md
@@ -0,0 +1,160 @@
+---
+title: "PolynomialExpansion"
+weight: 1
+type: docs
+aliases:
+- /operators/feature/polynomialexpansion.html
+---
+
+
+
+## PolynomialExpansion
+
+A Transformer that expands the input vectors in polynomial space.
+
+Take a 2-dimensional vector `(x, y)` as an example: if we expand it with degree 2,
+we get `(x, x * x, y, x * y, y * y)`.
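+
+As an illustration, the monomials of such an expansion can be enumerated in a few
+lines of plain Python. This is only a sketch of the idea: the `expand` helper below
+is hypothetical, and the order of its terms does not necessarily match the layout
+produced by the operator.
+
+```python
+from itertools import combinations_with_replacement
+from math import prod
+
+
+def expand(features, degree=2):
+    """Enumerates all monomials of the given features up to `degree`."""
+    terms = []
+    for d in range(1, degree + 1):
+        for combo in combinations_with_replacement(features, d):
+            terms.append(prod(combo))
+    return terms
+
+
+print(expand([2.0, 3.0]))  # [2.0, 3.0, 4.0, 6.0, 9.0], i.e. x, y, x*x, x*y, y*y
+```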

For more information about the polynomial expansion, see +http://en.wikipedia.org/wiki/Polynomial_expansion. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:------------------------| +| inputCol | Vector | `"input"` | Vectors to be expanded. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:------------------| +| outputCol | Vector | `"output"` | Expanded vectors. | + +### Parameters + +| Key | Default | Type | Required | Description | +|:----------|:-----------|:--------|:---------|:------------------------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| degree | `2` | Integer | no | Degree of the polynomial expansion. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.polynomialexpansion.PolynomialExpansion; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that creates a PolynomialExpansion instance and uses it for feature engineering. */ +public class PolynomialExpansionExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(Vectors.dense(2.1, 3.1, 1.2)), + Row.of(Vectors.dense(1.2, 3.1, 4.6))); + Table inputTable = tEnv.fromDataStream(inputStream).as("inputVec"); + + // Creates a PolynomialExpansion object and initializes its parameters. + PolynomialExpansion polynomialExpansion = + new PolynomialExpansion().setInputCol("inputVec").setDegree(2).setOutputCol("outputVec"); + + // Uses the PolynomialExpansion object for feature transformations. + Table outputTable = polynomialExpansion.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + Vector inputValue = (Vector) row.getField(polynomialExpansion.getInputCol()); + + Vector outputValue = (Vector) row.getField(polynomialExpansion.getOutputCol()); + + System.out.printf("Input Value: %s \tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a PolynomialExpansion instance and uses it for feature +# engineering. 
+
+from pyflink.common import Types
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo
+from pyflink.ml.feature.polynomialexpansion import PolynomialExpansion
+from pyflink.table import StreamTableEnvironment
+
+# create a new StreamExecutionEnvironment
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# create a StreamTableEnvironment
+t_env = StreamTableEnvironment.create(env)
+
+# generate input data
+input_data_table = t_env.from_data_stream(
+    env.from_collection([
+        (1, Vectors.dense(2.1, 3.1, 1.2, 2.1)),
+        (2, Vectors.dense(2.3, 2.1, 1.3, 1.2)),
+    ],
+        type_info=Types.ROW_NAMED(
+            ['id', 'input_vec'],
+            [Types.INT(), DenseVectorTypeInfo()])))
+
+# create a polynomial expansion object and initialize its parameters
+polynomial_expansion = PolynomialExpansion() \
+    .set_input_col('input_vec') \
+    .set_degree(2) \
+    .set_output_col('output_vec')
+
+# use the polynomial expansion model for feature engineering
+output = polynomial_expansion.transform(input_data_table)[0]
+
+# extract and display the results
+field_names = output.get_schema().get_field_names()
+for result in t_env.to_data_stream(output).execute_and_collect():
+    input_value = result[field_names.index(polynomial_expansion.get_input_col())]
+    output_value = result[field_names.index(polynomial_expansion.get_output_col())]
+    print('Input Value: ' + str(input_value) + '\tOutput Value: ' + str(output_value))
+
+```
+
+{{< /tab>}}
+
+{{< /tabs>}}
diff --git a/docs/content.zh/docs/operators/feature/randomsplitter.md b/docs/content.zh/docs/operators/feature/randomsplitter.md
new file mode 100644
index 000000000..7bea670a4
--- /dev/null
+++ b/docs/content.zh/docs/operators/feature/randomsplitter.md
@@ -0,0 +1,148 @@
+---
+title: "RandomSplitter"
+weight: 1
+type: docs
+aliases:
+- /operators/feature/randomsplitter.html
+---
+
+
+
+## RandomSplitter
+
+An AlgoOperator which splits a table into N tables according to the given weights.
+
+### Parameters
+
+| Key     | Default      | Type     | Required | Description                                                                                                                                                  |
+|:--------|:-------------|:---------|:---------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| weights | `[1.0, 1.0]` | Double[] | no       | The weights of data splitting.                                                                                                                               |
+| seed    | `null`       | Long     | no       | The random seed. This parameter guarantees reproducible output only when the parallelism is unchanged and each worker reads the same data in the same order. |
+
+### Examples
+
+{{< tabs examples >}}
+
+{{< tab "Java">}}
+
+```java
+import org.apache.flink.ml.feature.randomsplitter.RandomSplitter;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+
+/** Simple program that creates a RandomSplitter instance and uses it for data splitting. */
+public class RandomSplitterExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        // Generates input data.
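+        // With weights (4.0, 6.0), each row is independently routed to the first
+        // output with probability 0.4 and to the second with probability 0.6, so
+        // the 40%/60% split is only approximate for a dataset this small.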
+ DataStream inputStream = + env.fromElements( + Row.of(1, 10, 0), + Row.of(1, 10, 0), + Row.of(1, 10, 0), + Row.of(4, 10, 0), + Row.of(5, 10, 0), + Row.of(6, 10, 0), + Row.of(7, 10, 0), + Row.of(10, 10, 0), + Row.of(13, 10, 3)); + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates a RandomSplitter object and initializes its parameters. + RandomSplitter splitter = new RandomSplitter().setWeights(4.0, 6.0); + + // Uses the RandomSplitter to split inputData. + Table[] outputTables = splitter.transform(inputTable); + + // Extracts and displays the results. + System.out.println("Split Result 1 (40%)"); + for (CloseableIterator it = outputTables[0].execute().collect(); it.hasNext(); ) { + System.out.printf("%s\n", it.next()); + } + System.out.println("Split Result 2 (60%)"); + for (CloseableIterator it = outputTables[1].execute().collect(); it.hasNext(); ) { + System.out.printf("%s\n", it.next()); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a RandomSplitter instance and uses it for data splitting. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.randomsplitter import RandomSplitter +from pyflink.table import StreamTableEnvironment + +# Creates a new StreamExecutionEnvironment. +env = StreamExecutionEnvironment.get_execution_environment() + +# Creates a StreamTableEnvironment. +t_env = StreamTableEnvironment.create(env) + +# Generates input table. +input_table = t_env.from_data_stream( + env.from_collection([ + (1, 10, 0), + (1, 10, 0), + (1, 10, 0), + (4, 10, 0), + (5, 10, 0), + (6, 10, 0), + (7, 10, 0), + (10, 10, 0), + (13, 10, 0) + ], + type_info=Types.ROW_NAMED( + ['f0', 'f1', "f2"], + [Types.INT(), Types.INT(), Types.INT()]))) + +# Creates a RandomSplitter object and initializes its parameters. +splitter = RandomSplitter().set_weights(4.0, 6.0) + +# Uses the RandomSplitter to split the dataset. +output = splitter.transform(input_table) + +# Extracts and displays the results. +print("Split Result 1 (40%)") +for result in t_env.to_data_stream(output[0]).execute_and_collect(): + print(str(result)) + +print("Split Result 2 (60%)") +for result in t_env.to_data_stream(output[1]).execute_and_collect(): + print(str(result)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/regextokenizer.md b/docs/content.zh/docs/operators/feature/regextokenizer.md new file mode 100644 index 000000000..a0e4caeb4 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/regextokenizer.md @@ -0,0 +1,156 @@ +--- +title: "RegexTokenizer" +weight: 1 +type: docs +aliases: +- /operators/feature/regextokenizer.html +--- + + + +## RegexTokenizer + +RegexTokenizer is an algorithm that converts the input string +to lowercase and then splits it by white spaces based on regex. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-------------------------| +| inputCol | String | `"input"` | Strings to be tokenized. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:---------|:-----------|:-------------------| +| outputCol | String[] | `"output"` | Tokenized Strings. | + +### Parameters + +| Key | Default | Type | Required | Description | +|:---------------|:-----------|:--------|:---------|:------------------------------------------------------------------| +| minTokenLength | `1` | Integer | no | Minimum token length. 
| +| gaps | `true` | Boolean | no | Set regex to match gaps or tokens. | +| pattern | `"\s+"` | String | no | Regex pattern used for tokenizing. | +| toLowercase | `true` | Boolean | no | Whether to convert all characters to lowercase before tokenizing. | +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.regextokenizer.RegexTokenizer; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates a RegexTokenizer instance and uses it for feature engineering. */ +public class RegexTokenizerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements(Row.of("Test for tokenization."), Row.of("Te,st. punct")); + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates a RegexTokenizer object and initializes its parameters. + RegexTokenizer regexTokenizer = + new RegexTokenizer() + .setInputCol("input") + .setOutputCol("output") + .setPattern("\\w+|\\p{Punct}"); + + // Uses the Tokenizer object for feature transformations. + Table outputTable = regexTokenizer.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + String inputValue = (String) row.getField(regexTokenizer.getInputCol()); + String[] outputValues = (String[]) row.getField(regexTokenizer.getOutputCol()); + + System.out.printf( + "Input Value: %s \tOutput Values: %s\n", + inputValue, Arrays.toString(outputValues)); + } + } +} + + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a RegexTokenizer instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.regextokenizer import RegexTokenizer +from pyflink.table import StreamTableEnvironment + +env = StreamExecutionEnvironment.get_execution_environment() + +t_env = StreamTableEnvironment.create(env) + +# Generates input data. +input_data_table = t_env.from_data_stream( + env.from_collection([ + ('Test for tokenization.',), + ('Te,st. punct',), + ], + type_info=Types.ROW_NAMED( + ['input'], + [Types.STRING()]))) + +# Creates a RegexTokenizer object and initializes its parameters. +regex_tokenizer = RegexTokenizer() \ + .set_input_col("input") \ + .set_output_col("output") + +# Uses the Tokenizer object for feature transformations. +output = regex_tokenizer.transform(input_data_table)[0] + +# Extracts and displays the results. 
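+# With the default pattern '\s+' and gaps=True, the input is lower-cased and then
+# split on runs of whitespace, e.g. 'Te,st. punct' -> ['te,st.', 'punct'].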
+field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(regex_tokenizer.get_input_col())] + output_value = result[field_names.index(regex_tokenizer.get_output_col())] + print('Input Values: ' + str(input_value) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/robustscaler.md b/docs/content.zh/docs/operators/feature/robustscaler.md new file mode 100644 index 000000000..a37085dd6 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/robustscaler.md @@ -0,0 +1,211 @@ +--- +title: "RobustScaler" +weight: 1 +type: docs +aliases: +- /operators/feature/robustscaler.html +--- + + + +## RobustScaler + +RobustScaler is an algorithm that scales features using statistics that are +robust to outliers. + +This Scaler removes the median and scales the data according to the quantile +range (defaults to IQR: Interquartile Range). The IQR is the range between +the 1st quartile (25th quantile) and the 3rd quartile (75th quantile) but can +be configured. + +Centering and scaling happen independently on each feature by computing the +relevant statistics on the samples in the training set. Median and quantile +range are then stored to be used on later data using the transform method. + +Standardization of a dataset is a common requirement for many machine learning +estimators. Typically this is done by removing the mean and scaling to unit +variance. However, outliers can often influence the sample mean / variance +in a negative way. In such cases, the median and the interquartile range +often give better results. + +Note that NaN values are ignored in the computation of medians and ranges. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-----------------------| +| inputCol | Vector | `"input"` | Features to be scaled. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:-----------------| +| outputCol | Vector | `"output"` | Scaled features. | + +### Parameters + +Below are the parameters required by `RobustScalerModel`. + +| Key | Default | Type | Required | Description | +|---------------|------------|-------------|----------|-----------------------------------------------------------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| withCentering | `false` | Boolean | no | Whether to center the data with median before scaling. | +| withScaling | `true` | Boolean | no | Whether to scale the data to quantile range. | + +`RobustScaler` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|---------------|--------------|-------------|----------|-----------------------------------------------------------------------| +| lower | `0.25` | Double | no | Lower quantile to calculate quantile range. | +| upper | `0.75` | Double | no | Upper quantile to calculate quantile range. | +| relativeError | `0.001` | Double | no | The relative target precision for the approximate quantile algorithm. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.robustscaler.RobustScaler; +import org.apache.flink.ml.feature.robustscaler.RobustScalerModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a {@link RobustScaler} model and uses it for feature selection. */ +public class RobustScalerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. + DataStream trainStream = + env.fromElements( + Row.of(1, Vectors.dense(0.0, 0.0)), + Row.of(2, Vectors.dense(1.0, -1.0)), + Row.of(3, Vectors.dense(2.0, -2.0)), + Row.of(4, Vectors.dense(3.0, -3.0)), + Row.of(5, Vectors.dense(4.0, -4.0)), + Row.of(6, Vectors.dense(5.0, -5.0)), + Row.of(7, Vectors.dense(6.0, -6.0)), + Row.of(8, Vectors.dense(7.0, -7.0)), + Row.of(9, Vectors.dense(8.0, -8.0))); + Table trainTable = tEnv.fromDataStream(trainStream).as("id", "input"); + + // Creates a RobustScaler object and initializes its parameters. + RobustScaler robustScaler = + new RobustScaler() + .setLower(0.25) + .setUpper(0.75) + .setRelativeError(0.001) + .setWithScaling(true) + .setWithCentering(true); + + // Trains the RobustScaler model. + RobustScalerModel model = robustScaler.fit(trainTable); + + // Uses the RobustScaler model for predictions. + Table outputTable = model.transform(trainTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector inputValue = (DenseVector) row.getField(robustScaler.getInputCol()); + DenseVector outputValue = (DenseVector) row.getField(robustScaler.getOutputCol()); + System.out.printf("Input Value: %-15s\tOutput Value: %s\n", inputValue, outputValue); + } + } +} +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a RobustScaler instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.table import StreamTableEnvironment + +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo + +from pyflink.ml.feature.robustscaler import RobustScaler + +# Creates a new StreamExecutionEnvironment. +env = StreamExecutionEnvironment.get_execution_environment() + +# Creates a StreamTableEnvironment. +t_env = StreamTableEnvironment.create(env) + +# Generates input training and prediction data. +train_data = t_env.from_data_stream( + env.from_collection([ + (1, Vectors.dense(0.0, 0.0),), + (2, Vectors.dense(1.0, -1.0),), + (3, Vectors.dense(2.0, -2.0),), + (4, Vectors.dense(3.0, -3.0),), + (5, Vectors.dense(4.0, -4.0),), + (6, Vectors.dense(5.0, -5.0),), + (7, Vectors.dense(6.0, -6.0),), + (8, Vectors.dense(7.0, -7.0),), + (9, Vectors.dense(8.0, -8.0),), + ], + type_info=Types.ROW_NAMED( + ['id', 'input'], + [Types.INT(), DenseVectorTypeInfo()]) + )) + +# Creates an RobustScaler object and initializes its parameters. 
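+# With lower=0.25 and upper=0.75, the quantile range is the IQR: each feature is
+# centered by its median and scaled by its interquartile range.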
+robust_scaler = RobustScaler()\
+    .set_lower(0.25)\
+    .set_upper(0.75)\
+    .set_relative_error(0.001)\
+    .set_with_scaling(True)\
+    .set_with_centering(True)
+
+# Trains the RobustScaler Model.
+model = robust_scaler.fit(train_data)
+
+# Uses the RobustScaler Model for predictions.
+output = model.transform(train_data)[0]
+
+# Extracts and displays the results.
+field_names = output.get_schema().get_field_names()
+for result in t_env.to_data_stream(output).execute_and_collect():
+    input_index = field_names.index(robust_scaler.get_input_col())
+    output_index = field_names.index(robust_scaler.get_output_col())
+    print('Input Value: ' + str(result[input_index]) +
+          '\tOutput Value: ' + str(result[output_index]))

+
+```
+
+{{< /tab>}}
+
+{{< /tabs>}}
diff --git a/docs/content.zh/docs/operators/feature/sqltransformer.md b/docs/content.zh/docs/operators/feature/sqltransformer.md
new file mode 100644
index 000000000..c42920920
--- /dev/null
+++ b/docs/content.zh/docs/operators/feature/sqltransformer.md
@@ -0,0 +1,142 @@
+---
+title: "SQLTransformer"
+weight: 1
+type: docs
+aliases:
+- /operators/feature/sqltransformer.html
+---
+
+
+
+## SQLTransformer
+
+SQLTransformer implements the transformations that are defined by a SQL statement.
+
+Currently, we only support SQL syntax like `SELECT ... FROM __THIS__ ...` where
+`__THIS__` represents the input table and cannot be modified.
+
+The select clause specifies the fields, constants, and expressions to display in
+the output. Except for the cases described in the note below, it can be any
+select clause that Flink SQL supports. Users can also use Flink SQL built-in
+functions and UDFs to operate on these selected columns.
+
+For example, SQLTransformer supports statements like:
+
+- `SELECT a, a + b AS a_b FROM __THIS__`
+- `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5`
+- `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b`
+
+Note: This operator only generates an append-only/insert-only table as its output.
+If the output table could contain retract messages (e.g., performing a `SELECT
+... FROM __THIS__ GROUP BY ...` operation on a table in streaming mode), this
+operator aggregates all changelogs and only outputs the final state.
+
+### Parameters
+
+| Key       | Default | Type   | Required | Description    |
+|:----------|:--------|:-------|:---------|:---------------|
+| statement | `null`  | String | yes      | SQL statement. |
+
+### Examples
+
+{{< tabs examples >}}
+
+{{< tab "Java">}}
+
+```java
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.api.java.typeutils.RowTypeInfo;
+import org.apache.flink.ml.feature.sqltransformer.SQLTransformer;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+
+import java.util.Arrays;
+
+/** Simple program that creates a SQLTransformer instance and uses it for feature engineering. */
+public class SQLTransformerExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        // Generates input data.
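+        // The statement below adds two derived columns, v3 = v1 + v2 and v4 = v1 * v2,
+        // so the input row (0, 1.0, 3.0) becomes (0, 1.0, 3.0, 4.0, 3.0).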
+ DataStream inputStream = + env.fromCollection( + Arrays.asList(Row.of(0, 1.0, 3.0), Row.of(2, 2.0, 5.0)), + new RowTypeInfo(Types.INT, Types.DOUBLE, Types.DOUBLE)); + Table inputTable = tEnv.fromDataStream(inputStream).as("id", "v1", "v2"); + + // Creates a SQLTransformer object and initializes its parameters. + SQLTransformer sqlTransformer = + new SQLTransformer() + .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"); + + // Uses the SQLTransformer object for feature transformations. + Table outputTable = sqlTransformer.transform(inputTable)[0]; + + // Extracts and displays the results. + outputTable.execute().print(); + } +} +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a SQLTransformer instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.sqltransformer import SQLTransformer +from pyflink.table import StreamTableEnvironment + +env = StreamExecutionEnvironment.get_execution_environment() + +t_env = StreamTableEnvironment.create(env) + +# Generates input data. +input_data_table = t_env.from_data_stream( + env.from_collection([ + (0, 1.0, 3.0), + (2, 2.0, 5.0), + ], + type_info=Types.ROW_NAMED( + ['id', 'v1', 'v2'], + [Types.INT(), Types.DOUBLE(), Types.DOUBLE()]))) + +# Creates a SQLTransformer object and initializes its parameters. +sql_transformer = SQLTransformer() \ + .set_statement('SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__') + +# Uses the SQLTransformer object for feature transformations. +output_table = sql_transformer.transform(input_data_table)[0] + +# Extracts and displays the results. +output_table.execute().print() +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/standardscaler.md b/docs/content.zh/docs/operators/feature/standardscaler.md new file mode 100644 index 000000000..55879d35c --- /dev/null +++ b/docs/content.zh/docs/operators/feature/standardscaler.md @@ -0,0 +1,158 @@ +--- +title: "StandardScaler" +weight: 1 +type: docs +aliases: +- /operators/feature/standardscaler.html +--- + + + +## StandardScaler + +StandardScaler is an algorithm that standardizes the input features by removing +the mean and scaling each dimension to unit variance. +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-----------------------| +| inputCol | Vector | `"input"` | Features to be scaled. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:-----------------| +| outputCol | Vector | `"output"` | Scaled features. | + +### Parameters + +| Key | Default | Type | Required | Description | +|-----------|------------|---------|----------|----------------------------------------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| withMean | `false` | Boolean | no | Whether centers the data with mean before scaling. | +| withStd | `true` | Boolean | no | Whether scales the data with standard deviation. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.standardscaler.StandardScaler; +import org.apache.flink.ml.feature.standardscaler.StandardScalerModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that trains a StandardScaler model and uses it for feature engineering. */ +public class StandardScalerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(Vectors.dense(-2.5, 9, 1)), + Row.of(Vectors.dense(1.4, -5, 1)), + Row.of(Vectors.dense(2, -1, -2))); + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates a StandardScaler object and initializes its parameters. + StandardScaler standardScaler = new StandardScaler(); + + // Trains the StandardScaler Model. + StandardScalerModel model = standardScaler.fit(inputTable); + + // Uses the StandardScaler Model for predictions. + Table outputTable = model.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector inputValue = (DenseVector) row.getField(standardScaler.getInputCol()); + DenseVector outputValue = (DenseVector) row.getField(standardScaler.getOutputCol()); + System.out.printf("Input Value: %s\tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that trains a StandardScaler model and uses it for feature +# engineering. 
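+
+# Note: with the default parameters (withMean=false, withStd=true), each
+# dimension is scaled to unit standard deviation without being centered first.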
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.standardscaler import StandardScaler +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(-2.5, 9, 1),), + (Vectors.dense(1.4, -5, 1),), + (Vectors.dense(2, -1, -2),), + ], + type_info=Types.ROW_NAMED( + ['input'], + [DenseVectorTypeInfo()]) + )) + +# create a standard-scaler object and initialize its parameters +standard_scaler = StandardScaler() + +# train the standard-scaler model +model = standard_scaler.fit(input_data) + +# use the standard-scaler model for predictions +output = model.transform(input_data)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(standard_scaler.get_input_col())] + output_value = result[field_names.index(standard_scaler.get_output_col())] + print('Input Value: ' + str(input_value) + ' \tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/stopwordsremover.md b/docs/content.zh/docs/operators/feature/stopwordsremover.md new file mode 100644 index 000000000..d7e53ef80 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/stopwordsremover.md @@ -0,0 +1,165 @@ +--- +title: "StopWordsRemover" +weight: 1 +type: docs +aliases: +- /operators/feature/stopwordsremover.html +--- + + + +## StopWordsRemover + +A feature transformer that filters out stop words from input. + +Note: null values from input array are preserved unless adding null to stopWords +explicitly. + +See Also: Stop words +(Wikipedia) + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:---------|:--------|:---------------------------------------------------| +| inputCols | String[] | `null` | Arrays of strings containing stop words to remove. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:---------|:--------|:-------------------------------------------| +| outputCols | String[] | `null` | Arrays of strings with stop words removed. | + +### Parameters + +| Key | Default | Type | Required | Description | +|---------------|----------------------------------------------------|----------|----------|----------------------------------------------------------------------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCols | `null` | String[] | yes | Output column name. | +| stopWords | `StopWordsRemover.loadDefaultStopWords("english")` | String[] | no | The words to be filtered out. | +| caseSensitive | `false` | Boolean | no | Whether to do a case-sensitive comparison over the stop words. | +| locale | `StopWordsRemover.getDefaultOrUS().toString()` | String | no | Locale of the input for case insensitive matching. Ignored when caseSensitive is true. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.stopwordsremover.StopWordsRemover; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates a StopWordsRemover instance and uses it for feature engineering. */ +public class StopWordsRemoverExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of((Object) new String[] {"test", "test"}), + Row.of((Object) new String[] {"a", "b", "c", "d"}), + Row.of((Object) new String[] {"a", "the", "an"}), + Row.of((Object) new String[] {"A", "The", "AN"}), + Row.of((Object) new String[] {null}), + Row.of((Object) new String[] {})); + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates a StopWordsRemover object and initializes its parameters. + StopWordsRemover remover = + new StopWordsRemover().setInputCols("input").setOutputCols("output"); + + // Uses the StopWordsRemover object for feature transformations. + Table outputTable = remover.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + String[] inputValues = row.getFieldAs("input"); + String[] outputValues = row.getFieldAs("output"); + + System.out.printf( + "Input Values: %s\tOutput Values: %s\n", + Arrays.toString(inputValues), Arrays.toString(outputValues)); + } + } +} +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a StopWordsRemover instance and uses it for feature +# engineering. 
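+
+# Note: no stop words are configured explicitly below, so the default English
+# stop-word list is used; matching is case-insensitive by default, which is
+# why 'A', 'The' and 'AN' are removed as well.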
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.stopwordsremover import StopWordsRemover +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_table = t_env.from_data_stream( + env.from_collection([ + (["test", "test"],), + (["a", "b", "c", "d"],), + (["a", "the", "an"],), + (["A", "The", "AN"],), + ([None],), + ([],), + ], + type_info=Types.ROW_NAMED( + ['input'], + [Types.OBJECT_ARRAY(Types.STRING())]))) + +# create a StopWordsRemover object and initialize its parameters +remover = StopWordsRemover().set_input_cols('input').set_output_cols('output') + +# use the StopWordsRemover for feature engineering +output_table = remover.transform(input_table)[0] + +# extract and display the results +field_names = output_table.get_schema().get_field_names() +for result in t_env.to_data_stream(output_table).execute_and_collect(): + input_value = result[field_names.index('input')] + output_value = result[field_names.index('output')] + print('Input Value: ' + str(input_value) + '\tOutput Value: ' + str(output_value)) +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/stringindexer.md b/docs/content.zh/docs/operators/feature/stringindexer.md new file mode 100644 index 000000000..110f80019 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/stringindexer.md @@ -0,0 +1,219 @@ +--- +title: "StringIndexer" +weight: 1 +type: docs +aliases: +- /operators/feature/stringindexer.html +--- + + + +## StringIndexer + +StringIndexer maps one or more columns (string/numerical value) of the input to +one or more indexed output columns (integer value). The output indices of two +data points are the same iff their corresponding input columns are the same. The +indices are in [0, numDistinctValuesInThisColumn]. + +IndexToStringModel transforms input index column(s) to string column(s) using +the model data computed by StringIndexer. It is a reverse operation of +StringIndexerModel. +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:--------------|:--------|:---------------------------------------| +| inputCols | Number/String | `null` | String/Numerical values to be indexed. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:--------|:------------------------------------| +| outputCols | Double | `null` | Indices of string/numerical values. | + +### Parameters + +Below are the parameters required by `StringIndexerModel`. + +| Key | Default | Type | Required | Description | +|---------------|-----------|----------|----------|--------------------------------------------------------------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCols | `null` | String[] | yes | Output column names. | +| handleInvalid | `"error"` | String | no | Strategy to handle invalid entries. Supported values: 'error', 'skip', 'keep'. | + +`StringIndexer` needs parameters above and also below. 
+ +| Key | Default | Type | Required | Description | +|-----------------|---------------|---------|----------|-------------------------------------------------------------------------------------------------------------------------------------| +| stringOrderType | `"arbitrary"` | String | no | How to order strings of each column. Supported values: 'arbitrary', 'frequencyDesc', 'frequencyAsc', 'alphabetDesc', 'alphabetAsc'. | +| MaxIndexNum | `2147483647` | Integer | no | The max number of indices for each column. It only works when 'stringOrderType' is set as 'frequencyDesc'. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.stringindexer.StringIndexer; +import org.apache.flink.ml.feature.stringindexer.StringIndexerModel; +import org.apache.flink.ml.feature.stringindexer.StringIndexerParams; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that trains a StringIndexer model and uses it for feature engineering. */ +public class StringIndexerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. + DataStream trainStream = + env.fromElements( + Row.of("a", 1.0), + Row.of("b", 1.0), + Row.of("b", 2.0), + Row.of("c", 0.0), + Row.of("d", 2.0), + Row.of("a", 2.0), + Row.of("b", 2.0), + Row.of("b", -1.0), + Row.of("a", -1.0), + Row.of("c", -1.0)); + Table trainTable = tEnv.fromDataStream(trainStream).as("inputCol1", "inputCol2"); + + DataStream predictStream = + env.fromElements(Row.of("a", 2.0), Row.of("b", 1.0), Row.of("c", 2.0)); + Table predictTable = tEnv.fromDataStream(predictStream).as("inputCol1", "inputCol2"); + + // Creates a StringIndexer object and initializes its parameters. + StringIndexer stringIndexer = + new StringIndexer() + .setStringOrderType(StringIndexerParams.ALPHABET_ASC_ORDER) + .setInputCols("inputCol1", "inputCol2") + .setOutputCols("outputCol1", "outputCol2"); + + // Trains the StringIndexer Model. + StringIndexerModel model = stringIndexer.fit(trainTable); + + // Uses the StringIndexer Model for predictions. + Table outputTable = model.transform(predictTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + Object[] inputValues = new Object[stringIndexer.getInputCols().length]; + double[] outputValues = new double[stringIndexer.getInputCols().length]; + for (int i = 0; i < inputValues.length; i++) { + inputValues[i] = row.getField(stringIndexer.getInputCols()[i]); + outputValues[i] = (double) row.getField(stringIndexer.getOutputCols()[i]); + } + + System.out.printf( + "Input Values: %s \tOutput Values: %s\n", + Arrays.toString(inputValues), Arrays.toString(outputValues)); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that trains a StringIndexer model and uses it for feature +# engineering. 
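+
+# Note: with the 'alphabetAsc' order type, indices are assigned in ascending
+# value order within each column, e.g. 'a' -> 0, 'b' -> 1, 'c' -> 2, 'd' -> 3.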
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.stringindexer import StringIndexer +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input training and prediction data +train_table = t_env.from_data_stream( + env.from_collection([ + ('a', 1.), + ('b', 1.), + ('b', 2.), + ('c', 0.), + ('d', 2.), + ('a', 2.), + ('b', 2.), + ('b', -1.), + ('a', -1.), + ('c', -1.), + ], + type_info=Types.ROW_NAMED( + ['input_col1', 'input_col2'], + [Types.STRING(), Types.DOUBLE()]) + )) + +predict_table = t_env.from_data_stream( + env.from_collection([ + ('a', 2.), + ('b', 1.), + ('c', 2.), + ], + type_info=Types.ROW_NAMED( + ['input_col1', 'input_col2'], + [Types.STRING(), Types.DOUBLE()]) + )) + +# create a string-indexer object and initialize its parameters +string_indexer = StringIndexer() \ + .set_string_order_type('alphabetAsc') \ + .set_input_cols('input_col1', 'input_col2') \ + .set_output_cols('output_col1', 'output_col2') + +# train the string-indexer model +model = string_indexer.fit(train_table) + +# use the string-indexer model for feature engineering +output = model.transform(predict_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +input_values = [None for _ in string_indexer.get_input_cols()] +output_values = [None for _ in string_indexer.get_input_cols()] +for result in t_env.to_data_stream(output).execute_and_collect(): + for i in range(len(string_indexer.get_input_cols())): + input_values[i] = result[field_names.index(string_indexer.get_input_cols()[i])] + output_values[i] = result[field_names.index(string_indexer.get_output_cols()[i])] + print('Input Values: ' + str(input_values) + '\tOutput Values: ' + str(output_values)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/tokenizer.md b/docs/content.zh/docs/operators/feature/tokenizer.md new file mode 100644 index 000000000..8a3d33817 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/tokenizer.md @@ -0,0 +1,148 @@ +--- +title: "Tokenizer" +weight: 1 +type: docs +aliases: +- /operators/feature/tokenizer.html +--- + + + +## Tokenizer + +Tokenizer is an algorithm that converts the input string +to lowercase and then splits it by white spaces. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-------------------------| +| inputCol | String | `"input"` | Strings to be tokenized. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:---------|:-----------|:-------------------| +| outputCol | String[] | `"output"` | Tokenized strings. | + +### Parameters + +| Key | Default | Type | Required | Description | +|:----------|:-----------|:-------|:---------|:--------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. 
| + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java + +import org.apache.flink.ml.feature.tokenizer.Tokenizer; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates a Tokenizer instance and uses it for feature engineering. */ +public class TokenizerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements(Row.of("Test for tokenization."), Row.of("Te,st. punct")); + Table inputTable = tEnv.fromDataStream(inputStream).as("input"); + + // Creates a Tokenizer object and initializes its parameters. + Tokenizer tokenizer = new Tokenizer().setInputCol("input").setOutputCol("output"); + + // Uses the Tokenizer object for feature transformations. + Table outputTable = tokenizer.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + String inputValue = (String) row.getField(tokenizer.getInputCol()); + String[] outputValues = (String[]) row.getField(tokenizer.getOutputCol()); + + System.out.printf( + "Input Value: %s \tOutput Values: %s\n", + inputValue, Arrays.toString(outputValues)); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a Tokenizer instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.tokenizer import Tokenizer +from pyflink.table import StreamTableEnvironment + +env = StreamExecutionEnvironment.get_execution_environment() + +t_env = StreamTableEnvironment.create(env) + +# Generates input data. +input_data_table = t_env.from_data_stream( + env.from_collection([ + ('Test for tokenization.',), + ('Te,st. punct',), + ], + type_info=Types.ROW_NAMED( + ['input'], + [Types.STRING()]))) + +# Creates a Tokenizer object and initializes its parameters. +tokenizer = Tokenizer() \ + .set_input_col("input") \ + .set_output_col("output") + +# Uses the Tokenizer object for feature transformations. +output = tokenizer.transform(input_data_table)[0] + +# Extracts and displays the results. 
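+# For example, 'Test for tokenization.' is lowercased and split on whitespace
+# into ['test', 'for', 'tokenization.']; punctuation stays inside the tokens.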
+field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(tokenizer.get_input_col())] + output_value = result[field_names.index(tokenizer.get_output_col())] + print('Input Value: ' + str(input_value) + '\tOutput Values: ' + str(output_value)) +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/univariatefeatureselector.md b/docs/content.zh/docs/operators/feature/univariatefeatureselector.md new file mode 100644 index 000000000..c12288c2e --- /dev/null +++ b/docs/content.zh/docs/operators/feature/univariatefeatureselector.md @@ -0,0 +1,225 @@ +--- +title: "UnivariateFeatureSelector" +weight: 1 +type: docs +aliases: +- /operators/feature/univariatefeatureselector.html +--- + + + +## UnivariateFeatureSelector +UnivariateFeatureSelector is an algorithm that selects features based on +univariate statistical tests against labels. + +Currently, Flink supports three UnivariateFeatureSelectors: chi-squared, +ANOVA F-test and F-value. User can choose UnivariateFeatureSelector by +setting `featureType` and `labelType`, and Flink will pick the score function +based on the specified `featureType` and `labelType`. + +The following combination of `featureType` and `labelType` are supported: + +
+
+- `featureType` `categorical` and `labelType` `categorical`: Flink uses
+  chi-squared, i.e. chi2 in sklearn.
+- `featureType` `continuous` and `labelType` `categorical`: Flink uses
+  ANOVA F-test, i.e. f_classif in sklearn.
+- `featureType` `continuous` and `labelType` `continuous`: Flink uses
+  F-value, i.e. f_regression in sklearn.
+
+UnivariateFeatureSelector supports different selection modes:
+
+- numTopFeatures: chooses a fixed number of top features according to a
+  hypothesis.
+- percentile: similar to numTopFeatures but chooses a fraction of all
+  features instead of a fixed number.
+- fpr: chooses all features whose p-values are below a threshold, thus
+  controlling the false positive rate of selection.
+- fdr: uses the Benjamini-Hochberg procedure to choose all features whose
+  false discovery rate is below a threshold.
+- fwe: chooses all features whose p-values are below a threshold. The
+  threshold is scaled by 1/numFeatures, thus controlling the family-wise
+  error rate of selection.
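+
+For instance, a minimal sketch of configuring the `percentile` mode with the
+PyFlink API used in the examples below (assuming the `selectionMode` parameter
+exposes the usual `set_selection_mode` setter; the threshold value 0.25 is
+illustrative):
+
+```python
+from pyflink.ml.feature.univariatefeatureselector import UnivariateFeatureSelector
+
+# Keep the top 25% of continuous features, scored with the ANOVA F-test
+# against a categorical label.
+selector = UnivariateFeatureSelector() \
+    .set_feature_type('continuous') \
+    .set_label_type('categorical') \
+    .set_selection_mode('percentile') \
+    .set_selection_threshold(0.25)
+```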
+ +By default, the selection mode is `numTopFeatures`. + +### Input Columns + +| Param name | Type | Default | Description | +|:------------|:-------|:-------------|:-----------------------| +| featuresCol | Vector | `"features"` | Feature vector. | +| labelCol | Number | `"label"` | Label of the features. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:-------------------| +| outputCol | Vector | `"output"` | Selected features. | + +### Parameters + +Below are the parameters required by `UnivariateFeatureSelectorModel`. + +| Key | Default | Type | Required | Description | +|-------------|--------------|--------|----------|-------------------------| +| featuresCol | `"features"` | String | no | Features column name. | +| outputCol | `"output"` | String | no | Output column name. | + +`UnivariateFeatureSelector` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +| ------------------ | ------------------ | ------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| labelCol | `"label"` | String | no | Label column name. | +| featureType | `null` | String | yes | The feature type. Supported values: 'categorical', 'continuous'. | +| labelType | `null` | String | yes | The label type. Supported values: 'categorical', 'continuous'. | +| selectionMode | `"numTopFeatures"` | String | no | The feature selection mode. Supported values: 'numTopFeatures', 'percentile', 'fpr', 'fdr', 'fwe'. | +| selectionThreshold | `null` | Number | no | The upper bound of the features that selector will select. If not set, it will be replaced with a meaningful value according to different selection modes at runtime. When the mode is numTopFeatures, it will be replaced with 50; when the mode is percentile, it will be replaced with 0.1; otherwise, it will be replaced with 0.05. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.univariatefeatureselector.UnivariateFeatureSelector; +import org.apache.flink.ml.feature.univariatefeatureselector.UnivariateFeatureSelectorModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** + * Simple program that trains a {@link UnivariateFeatureSelector} model and uses it for feature + * selection. + */ +public class UnivariateFeatureSelectorExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. 
+ DataStream trainStream = + env.fromElements( + Row.of(Vectors.dense(1.7, 4.4, 7.6, 5.8, 9.6, 2.3), 3.0), + Row.of(Vectors.dense(8.8, 7.3, 5.7, 7.3, 2.2, 4.1), 2.0), + Row.of(Vectors.dense(1.2, 9.5, 2.5, 3.1, 8.7, 2.5), 1.0), + Row.of(Vectors.dense(3.7, 9.2, 6.1, 4.1, 7.5, 3.8), 2.0), + Row.of(Vectors.dense(8.9, 5.2, 7.8, 8.3, 5.2, 3.0), 4.0), + Row.of(Vectors.dense(7.9, 8.5, 9.2, 4.0, 9.4, 2.1), 4.0)); + Table trainTable = tEnv.fromDataStream(trainStream).as("features", "label"); + + // Creates a UnivariateFeatureSelector object and initializes its parameters. + UnivariateFeatureSelector univariateFeatureSelector = + new UnivariateFeatureSelector() + .setFeaturesCol("features") + .setLabelCol("label") + .setFeatureType("continuous") + .setLabelType("categorical") + .setSelectionThreshold(1); + + // Trains the UnivariateFeatureSelector model. + UnivariateFeatureSelectorModel model = univariateFeatureSelector.fit(trainTable); + + // Uses the UnivariateFeatureSelector model for predictions. + Table outputTable = model.transform(trainTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector inputValue = + (DenseVector) row.getField(univariateFeatureSelector.getFeaturesCol()); + DenseVector outputValue = + (DenseVector) row.getField(univariateFeatureSelector.getOutputCol()); + System.out.printf("Input Value: %-15s\tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a UnivariateFeatureSelector instance and uses it for feature +# engineering. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.univariatefeatureselector import UnivariateFeatureSelector +from pyflink.table import StreamTableEnvironment + +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo + +env = StreamExecutionEnvironment.get_execution_environment() + +t_env = StreamTableEnvironment.create(env) + +# Generates input training and prediction data. +input_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(1.7, 4.4, 7.6, 5.8, 9.6, 2.3), 3.0,), + (Vectors.dense(8.8, 7.3, 5.7, 7.3, 2.2, 4.1), 2.0,), + (Vectors.dense(1.2, 9.5, 2.5, 3.1, 8.7, 2.5), 1.0,), + (Vectors.dense(3.7, 9.2, 6.1, 4.1, 7.5, 3.8), 2.0,), + (Vectors.dense(8.9, 5.2, 7.8, 8.3, 5.2, 3.0), 4.0,), + (Vectors.dense(7.9, 8.5, 9.2, 4.0, 9.4, 2.1), 4.0,), + ], + type_info=Types.ROW_NAMED( + ['features', 'label'], + [DenseVectorTypeInfo(), Types.FLOAT()]) + )) + +# Creates an UnivariateFeatureSelector object and initializes its parameters. +univariate_feature_selector = UnivariateFeatureSelector() \ + .set_features_col('features') \ + .set_label_col('label') \ + .set_feature_type('continuous') \ + .set_label_type('categorical') \ + .set_selection_threshold(1) + +# Trains the UnivariateFeatureSelector Model. +model = univariate_feature_selector.fit(input_table) + +# Uses the UnivariateFeatureSelector Model for predictions. +output = model.transform(input_table)[0] + +# Extracts and displays the results. 
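+# With the default numTopFeatures mode and selection_threshold=1, each output
+# vector keeps only the single feature with the best ANOVA F-test score.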
+field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_index = field_names.index(univariate_feature_selector.get_features_col()) + output_index = field_names.index(univariate_feature_selector.get_output_col()) + print('Input Value: ' + str(result[input_index]) + + '\tOutput Value: ' + str(result[output_index])) + +``` +{{< /tab >}} + +{{< /tab>}} \ No newline at end of file diff --git a/docs/content.zh/docs/operators/feature/variancethresholdselector.md b/docs/content.zh/docs/operators/feature/variancethresholdselector.md new file mode 100644 index 000000000..5cd9a2f14 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/variancethresholdselector.md @@ -0,0 +1,189 @@ +--- +title: "VarianceThresholdSelector" +weight: 1 +type: docs +aliases: +- /operators/feature/variancethresholdselector.html +--- + + + +## VarianceThresholdSelector + +VarianceThresholdSelector is a selector that removes low-variance features. +Features with a variance not greater than the varianceThreshold will be removed. +If not set, varianceThreshold defaults to 0, which means only features with +variance 0 (i.e. features that have the same value in all samples) will be removed. + +### Input Columns + +| Param name | Type | Default | Description | +|:------------|:-------|:----------|:----------------| +| inputCol | Vector | `"input"` | Input features. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:-----------------| +| outputCol | Vector | `"output"` | Scaled features. | + +### Parameters + +Below are the parameters required by `VarianceThresholdSelectorModel`. + +| Key | Default | Type | Required | Description | +|------------|------------|--------|----------|-----------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | + +`VarianceThresholdSelector` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|-------------------|--------------|--------|----------|---------------------------------------------------------------------------| +| varianceThreshold | `0.0` | Double | no | Features with a variance not greater than this threshold will be removed. | + + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.variancethresholdselector.VarianceThresholdSelector; +import org.apache.flink.ml.feature.variancethresholdselector.VarianceThresholdSelectorModel; +import org.apache.flink.ml.linalg.DenseVector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** + * Simple program that trains a {@link VarianceThresholdSelector} model and uses it for feature + * selection. + */ +public class VarianceThresholdSelectorExample { + + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input training and prediction data. 
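+        // Only features whose variance is strictly greater than the threshold
+        // (8.0, set below) are kept; the rest are removed from the vectors.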
+ DataStream trainStream = + env.fromElements( + Row.of(1, Vectors.dense(5.0, 7.0, 0.0, 7.0, 6.0, 0.0)), + Row.of(2, Vectors.dense(0.0, 9.0, 6.0, 0.0, 5.0, 9.0)), + Row.of(3, Vectors.dense(0.0, 9.0, 3.0, 0.0, 5.0, 5.0)), + Row.of(4, Vectors.dense(1.0, 9.0, 8.0, 5.0, 7.0, 4.0)), + Row.of(5, Vectors.dense(9.0, 8.0, 6.0, 5.0, 4.0, 4.0)), + Row.of(6, Vectors.dense(6.0, 9.0, 7.0, 0.0, 2.0, 0.0))); + Table trainTable = tEnv.fromDataStream(trainStream).as("id", "input"); + + // Create a VarianceThresholdSelector object and initialize its parameters + double threshold = 8.0; + VarianceThresholdSelector varianceThresholdSelector = + new VarianceThresholdSelector() + .setVarianceThreshold(threshold) + .setInputCol("input"); + + // Train the VarianceThresholdSelector model. + VarianceThresholdSelectorModel model = varianceThresholdSelector.fit(trainTable); + + // Uses the VarianceThresholdSelector model for predictions. + Table outputTable = model.transform(trainTable)[0]; + + // Extracts and displays the results. + System.out.printf("Variance Threshold: %s\n", threshold); + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + DenseVector inputValue = + (DenseVector) row.getField(varianceThresholdSelector.getInputCol()); + DenseVector outputValue = + (DenseVector) row.getField(varianceThresholdSelector.getOutputCol()); + System.out.printf("Input Values: %-15s\tOutput Values: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that trains a VarianceThresholdSelector model and uses it for feature +# selection. + +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.variancethresholdselector import VarianceThresholdSelector +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input training and prediction data +train_data = t_env.from_data_stream( + env.from_collection([ + (1, Vectors.dense(5.0, 7.0, 0.0, 7.0, 6.0, 0.0),), + (2, Vectors.dense(0.0, 9.0, 6.0, 0.0, 5.0, 9.0),), + (3, Vectors.dense(0.0, 9.0, 3.0, 0.0, 5.0, 5.0),), + (4, Vectors.dense(1.0, 9.0, 8.0, 5.0, 7.0, 4.0),), + (5, Vectors.dense(9.0, 8.0, 6.0, 5.0, 4.0, 4.0),), + (6, Vectors.dense(6.0, 9.0, 7.0, 0.0, 2.0, 0.0),), + ], + type_info=Types.ROW_NAMED( + ['id', 'input'], + [Types.INT(), DenseVectorTypeInfo()]) + )) + +# create a VarianceThresholdSelector object and initialize its parameters +threshold = 8.0 +variance_thread_selector = VarianceThresholdSelector()\ + .set_input_col("input")\ + .set_variance_threshold(threshold) + +# train the VarianceThresholdSelector model +model = variance_thread_selector.fit(train_data) + +# use the VarianceThresholdSelector model for predictions +output = model.transform(train_data)[0] + +# extract and display the results +print("Variance Threshold: " + str(threshold)) +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(variance_thread_selector.get_input_col())] + output_value = result[field_names.index(variance_thread_selector.get_output_col())] + print('Input Values: ' + str(input_value) + ' \tOutput Values: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< 
/tabs>}} diff --git a/docs/content.zh/docs/operators/feature/vectorassembler.md b/docs/content.zh/docs/operators/feature/vectorassembler.md new file mode 100644 index 000000000..2877e419c --- /dev/null +++ b/docs/content.zh/docs/operators/feature/vectorassembler.md @@ -0,0 +1,193 @@ +--- +title: "VectorAssembler" +weight: 1 +type: docs +aliases: +- /operators/feature/vectorassembler.html +--- + + + +## VectorAssembler +A Transformer which combines a given list of input columns into a vector column. Input columns +would be numerical or vectors whose sizes are specified by the {@link #INPUT_SIZES} parameter. +Invalid input data with null values or values with wrong sizes would be dealt with according to +the strategy specified by the {@link HasHandleInvalid} parameter as follows: +
+
+- keep: If the input column data is null, a vector would be created with the
+  specified size and NaN values. The vector would be used in the assembling
+  process to represent the input column data. If the input column data is a
+  vector, the data would be used in the assembling process even if it has a
+  wrong size.
+- skip: If the input column data is null or a vector with wrong size, the
+  input row would be filtered out and not be sent to downstream operators.
+- error: If the input column data is null or a vector with wrong size, an
+  exception would be thrown.
+
+### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:--------------|:--------|:--------------------------------| +| inputCols | Number/Vector | `null` | Number/Vectors to be assembled. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:------------------| +| outputCol | Vector | `"output"` | Assembled vector. | + +### Parameters + +| Key | Default | Type | Required | Description | +|-----------------|------------|-----------|----------|--------------------------------------------------------------------------------| +| inputCols | `null` | String[] | yes | Input column names. | +| outputCol | `"output"` | String | no | Output column name. | +| inputSizes | `null` | Integer[] | yes | Sizes of the input elements to be assembled. | +| handleInvalid | `"error"` | String | no | Strategy to handle invalid entries. Supported values: 'error', 'skip', 'keep'. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.vectorassembler.VectorAssembler; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; + +/** Simple program that creates a VectorAssembler instance and uses it for feature engineering. */ +public class VectorAssemblerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of( + Vectors.dense(2.1, 3.1), + 1.0, + Vectors.sparse(5, new int[] {3}, new double[] {1.0})), + Row.of( + Vectors.dense(2.1, 3.1), + 1.0, + Vectors.sparse( + 5, + new int[] {4, 2, 3, 1}, + new double[] {4.0, 2.0, 3.0, 1.0}))); + Table inputTable = tEnv.fromDataStream(inputStream).as("vec", "num", "sparseVec"); + + // Creates a VectorAssembler object and initializes its parameters. + VectorAssembler vectorAssembler = + new VectorAssembler() + .setInputCols("vec", "num", "sparseVec") + .setOutputCol("assembledVec") + .setInputSizes(2, 1, 5); + + // Uses the VectorAssembler object for feature transformations. + Table outputTable = vectorAssembler.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + Object[] inputValues = new Object[vectorAssembler.getInputCols().length]; + for (int i = 0; i < inputValues.length; i++) { + inputValues[i] = row.getField(vectorAssembler.getInputCols()[i]); + } + + Vector outputValue = (Vector) row.getField(vectorAssembler.getOutputCol()); + + System.out.printf( + "Input Values: %s \tOutput Value: %s\n", + Arrays.toString(inputValues), outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a VectorAssembler instance and uses it for feature +# engineering. 
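+
+# Note: handle_invalid is set to 'keep' below, so null or wrongly sized inputs
+# are still assembled (a null column is padded with NaN values) instead of
+# raising an error.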
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo, SparseVectorTypeInfo +from pyflink.ml.feature.vectorassembler import VectorAssembler +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(2.1, 3.1), + 1.0, + Vectors.sparse(5, [3], [1.0])), + (Vectors.dense(2.1, 3.1), + 1.0, + Vectors.sparse(5, [1, 2, 3, 4], + [1.0, 2.0, 3.0, 4.0])), + ], + type_info=Types.ROW_NAMED( + ['vec', 'num', 'sparse_vec'], + [DenseVectorTypeInfo(), Types.DOUBLE(), SparseVectorTypeInfo()]))) + +# create a vector assembler object and initialize its parameters +vector_assembler = VectorAssembler() \ + .set_input_cols('vec', 'num', 'sparse_vec') \ + .set_output_col('assembled_vec') \ + .set_input_sizes(2, 1, 5) \ + .set_handle_invalid('keep') + +# use the vector assembler for feature engineering +output = vector_assembler.transform(input_data_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +input_values = [None for _ in vector_assembler.get_input_cols()] +for result in t_env.to_data_stream(output).execute_and_collect(): + for i in range(len(vector_assembler.get_input_cols())): + input_values[i] = result[field_names.index(vector_assembler.get_input_cols()[i])] + output_value = result[field_names.index(vector_assembler.get_output_col())] + print('Input Values: ' + str(input_values) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/vectorindexer.md b/docs/content.zh/docs/operators/feature/vectorindexer.md new file mode 100644 index 000000000..c54953ffc --- /dev/null +++ b/docs/content.zh/docs/operators/feature/vectorindexer.md @@ -0,0 +1,210 @@ +--- +title: "VectorIndexer" +weight: 1 +type: docs +aliases: +- /operators/feature/vectorindexer.html +--- + + + +## VectorIndexer + +VectorIndexer is an algorithm that implements the vector +indexing algorithm. A vector indexer maps each column of +the input vector into a continuous/categorical feature. +Whether one feature is transformed into a continuous or +categorical feature depends on the number of distinct +values in this column. If the number of distinct values +in one column is greater than a specified parameter +(i.e., maxCategories), the corresponding output column +is unchanged. Otherwise, it is transformed into a +categorical value. For categorical outputs, the indices +are in [0, numDistinctValuesInThisColumn]. + +The output model is organized in ascending order except +that 0.0 is always mapped to 0 (for sparsity). + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:-----------------------| +| inputCol | Vector | `"input"` | Vectors to be indexed. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:-----------------| +| outputCol | Vector | `"output"` | Indexed vectors. | + +### Parameters + +Below are the parameters required by `VectorIndexerModel`. 
+ +| Key | Default | Type | Required | Description | +|:--------------|:-----------|:-------|:---------|:---------------------------------------------------------------------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| handleInvalid | `"error"` | String | no | Strategy to handle invalid entries. Supported values: `'error', 'skip', 'keep'`. | + +`VectorIndexer` needs parameters above and also below. + +| Key | Default | Type | Required | Description | +|:--------------|:--------|:--------|:---------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| maxCategories | `20` | Integer | no | Threshold for the number of values a categorical feature can take (>= 2). If a feature is found to have > maxCategories values, then it is declared continuous. | + +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.common.param.HasHandleInvalid; +import org.apache.flink.ml.feature.vectorindexer.VectorIndexer; +import org.apache.flink.ml.feature.vectorindexer.VectorIndexerModel; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; +import java.util.List; + +/** Simple program that creates a VectorIndexer instance and uses it for feature engineering. */ +public class VectorIndexerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + List trainInput = + Arrays.asList( + Row.of(Vectors.dense(1, 1)), + Row.of(Vectors.dense(2, -1)), + Row.of(Vectors.dense(3, 1)), + Row.of(Vectors.dense(4, 0)), + Row.of(Vectors.dense(5, 0))); + + List predictInput = + Arrays.asList( + Row.of(Vectors.dense(0, 2)), + Row.of(Vectors.dense(0, 0)), + Row.of(Vectors.dense(0, -1))); + + Table trainTable = tEnv.fromDataStream(env.fromCollection(trainInput)).as("input"); + Table predictTable = tEnv.fromDataStream(env.fromCollection(predictInput)).as("input"); + + // Creates a VectorIndexer object and initializes its parameters. + VectorIndexer vectorIndexer = + new VectorIndexer() + .setInputCol("input") + .setOutputCol("output") + .setHandleInvalid(HasHandleInvalid.KEEP_INVALID) + .setMaxCategories(3); + + // Trains the VectorIndexer Model. + VectorIndexerModel model = vectorIndexer.fit(trainTable); + + // Uses the VectorIndexer Model for predictions. + Table outputTable = model.transform(predictTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + System.out.printf( + "Input Value: %s \tOutput Value: %s\n", + row.getField(vectorIndexer.getInputCol()), + row.getField(vectorIndexer.getOutputCol())); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that trains a VectorIndexer model and uses it for feature +# engineering. 
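+
+# Note: with max_categories=3, the first column (5 distinct training values)
+# is treated as continuous and passed through unchanged, while the second
+# column (3 distinct values) is mapped to categorical indices; 'keep' prevents
+# prediction-time values unseen during training from raising an error.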
+ +from pyflink.common import Types +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.feature.vectorindexer import VectorIndexer +from pyflink.table import StreamTableEnvironment + +# Creates a new StreamExecutionEnvironment. +env = StreamExecutionEnvironment.get_execution_environment() + +# Creates a StreamTableEnvironment. +t_env = StreamTableEnvironment.create(env) + +# Generates input training and prediction data. +train_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(1, 1),), + (Vectors.dense(2, -1),), + (Vectors.dense(3, 1),), + (Vectors.dense(4, 0),), + (Vectors.dense(5, 0),) + ], + type_info=Types.ROW_NAMED( + ['input', ], + [DenseVectorTypeInfo(), ]))) + +predict_table = t_env.from_data_stream( + env.from_collection([ + (Vectors.dense(0, 2),), + (Vectors.dense(0, 0),), + (Vectors.dense(0, -1),), + ], + type_info=Types.ROW_NAMED( + ['input', ], + [DenseVectorTypeInfo(), ]))) + +# Creates a VectorIndexer object and initializes its parameters. +vector_indexer = VectorIndexer() \ + .set_input_col('input') \ + .set_output_col('output') \ + .set_handle_invalid('keep') \ + .set_max_categories(3) + +# Trains the VectorIndexer Model. +model = vector_indexer.fit(train_table) + +# Uses the VectorIndexer Model for predictions. +output = model.transform(predict_table)[0] + +# Extracts and displays the results. +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + print('Input Value: ' + str(result[field_names.index(vector_indexer.get_input_col())]) + + '\tOutput Value: ' + str(result[field_names.index(vector_indexer.get_output_col())])) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/feature/vectorslicer.md b/docs/content.zh/docs/operators/feature/vectorslicer.md new file mode 100644 index 000000000..c7965e0c0 --- /dev/null +++ b/docs/content.zh/docs/operators/feature/vectorslicer.md @@ -0,0 +1,158 @@ +--- +title: "VectorSlicer" +weight: 1 +type: docs +aliases: +- /operators/feature/vectorslicer.html +--- + + + +## VectorSlicer + +VectorSlicer transforms a vector to a new feature, which is a sub-array of the original +feature. It is useful for extracting features from a given vector. + +Note that duplicate features are not allowed, so there can be no overlap between selected +indices. If the max value of the indices is greater than the size of the input vector, +it throws an IllegalArgumentException. + +### Input Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:----------|:---------------------| +| inputCol | Vector | `"input"` | Vector to be sliced. | + +### Output Columns + +| Param name | Type | Default | Description | +|:-----------|:-------|:-----------|:---------------| +| outputCol | Vector | `"output"` | Sliced vector. | + +### Parameters + +| Key | Default | Type | Required | Description | +|-----------|------------|-----------|----------|---------------------------------------------------------------| +| inputCol | `"input"` | String | no | Input column name. | +| outputCol | `"output"` | String | no | Output column name. | +| indices | `null` | Integer[] | yes | An array of indices to select features from a vector column. 
| +### Examples + +{{< tabs examples >}} + +{{< tab "Java">}} + +```java +import org.apache.flink.ml.feature.vectorslicer.VectorSlicer; +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +/** Simple program that creates a VectorSlicer instance and uses it for feature engineering. */ +public class VectorSlicerExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input data. + DataStream inputStream = + env.fromElements( + Row.of(Vectors.dense(2.1, 3.1, 1.2, 3.1, 4.6)), + Row.of(Vectors.dense(1.2, 3.1, 4.6, 2.1, 3.1))); + Table inputTable = tEnv.fromDataStream(inputStream).as("vec"); + + // Creates a VectorSlicer object and initializes its parameters. + VectorSlicer vectorSlicer = + new VectorSlicer().setInputCol("vec").setIndices(1, 2, 3).setOutputCol("slicedVec"); + + // Uses the VectorSlicer object for feature transformations. + Table outputTable = vectorSlicer.transform(inputTable)[0]; + + // Extracts and displays the results. + for (CloseableIterator it = outputTable.execute().collect(); it.hasNext(); ) { + Row row = it.next(); + + Vector inputValue = (Vector) row.getField(vectorSlicer.getInputCol()); + + Vector outputValue = (Vector) row.getField(vectorSlicer.getOutputCol()); + + System.out.printf("Input Value: %s \tOutput Value: %s\n", inputValue, outputValue); + } + } +} + +``` + +{{< /tab>}} + +{{< tab "Python">}} + +```python +# Simple program that creates a VectorSlicer instance and uses it for feature +# engineering. 
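+
+# Note: indices (1, 2, 3) select the second through fourth elements of each
+# input vector, so every output vector has size 3.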
+ +from pyflink.common import Types +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo +from pyflink.ml.feature.vectorslicer import VectorSlicer +from pyflink.table import StreamTableEnvironment + +# create a new StreamExecutionEnvironment +env = StreamExecutionEnvironment.get_execution_environment() + +# create a StreamTableEnvironment +t_env = StreamTableEnvironment.create(env) + +# generate input data +input_data_table = t_env.from_data_stream( + env.from_collection([ + (1, Vectors.dense(2.1, 3.1, 1.2, 2.1)), + (2, Vectors.dense(2.3, 2.1, 1.3, 1.2)), + ], + type_info=Types.ROW_NAMED( + ['id', 'vec'], + [Types.INT(), DenseVectorTypeInfo()]))) + +# create a vector slicer object and initialize its parameters +vector_slicer = VectorSlicer() \ + .set_input_col('vec') \ + .set_indices(1, 2, 3) \ + .set_output_col('sub_vec') + +# use the vector slicer model for feature engineering +output = vector_slicer.transform(input_data_table)[0] + +# extract and display the results +field_names = output.get_schema().get_field_names() +for result in t_env.to_data_stream(output).execute_and_collect(): + input_value = result[field_names.index(vector_slicer.get_input_col())] + output_value = result[field_names.index(vector_slicer.get_output_col())] + print('Input Value: ' + str(input_value) + '\tOutput Value: ' + str(output_value)) + +``` + +{{< /tab>}} + +{{< /tabs>}} diff --git a/docs/content.zh/docs/operators/functions.md b/docs/content.zh/docs/operators/functions.md new file mode 100644 index 000000000..96968eae8 --- /dev/null +++ b/docs/content.zh/docs/operators/functions.md @@ -0,0 +1,236 @@ +--- +title: "Functions" +type: docs +weight: 2 +aliases: +- /operators/functions.html +--- + + +## Functions + +Flink ML provides users with some built-in table functions for data +transformations. This page gives a brief overview of them. + +### vectorToArray + +This function converts a column of Flink ML sparse/dense vectors into a column +of double arrays. + +{{< tabs vectorToArray_examples >}} + +{{< tab "Java">}} +```java +import org.apache.flink.ml.linalg.Vector; +import org.apache.flink.ml.linalg.Vectors; +import org.apache.flink.ml.linalg.typeinfo.VectorTypeInfo; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; +import org.apache.flink.util.CloseableIterator; + +import java.util.Arrays; +import java.util.List; + +import static org.apache.flink.ml.Functions.vectorToArray; +import static org.apache.flink.table.api.Expressions.$; + +/** Simple program that converts a column of dense/sparse vectors into a column of double arrays. */ +public class VectorToArrayExample { + public static void main(String[] args) { + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + + // Generates input vector data. + List vectors = + Arrays.asList( + Vectors.dense(0.0, 0.0), + Vectors.sparse(2, new int[] {1}, new double[] {1.0})); + Table inputTable = + tEnv.fromDataStream(env.fromCollection(vectors, VectorTypeInfo.INSTANCE)) + .as("vector"); + + // Converts each vector to a double array. + Table outputTable = inputTable.select($("vector"), vectorToArray($("vector")).as("array")); + + // Extracts and displays the results. 
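+        // The sparse vector (size 2, index 1 set to 1.0) converts to the
+        // double array [0.0, 1.0].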
+        for (CloseableIterator<Row> it = outputTable.execute().collect(); it.hasNext(); ) {
+            Row row = it.next();
+            Vector vector = row.getFieldAs("vector");
+            Double[] doubleArray = row.getFieldAs("array");
+            System.out.printf(
+                    "Input vector: %s\tOutput double array: %s\n",
+                    vector, Arrays.toString(doubleArray));
+        }
+    }
+}
+```
+{{< /tab>}}
+
+{{< tab "Python">}}
+```python
+# Simple program that converts a column of dense/sparse vectors into a column of double arrays.
+
+from pyflink.common import Types
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.table import StreamTableEnvironment
+
+from pyflink.ml.linalg import Vectors, VectorTypeInfo
+
+from pyflink.ml.functions import vector_to_array
+from pyflink.table.expressions import col
+
+# create a new StreamExecutionEnvironment
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# create a StreamTableEnvironment
+t_env = StreamTableEnvironment.create(env)
+
+# generate input vector data
+vectors = [
+    (Vectors.dense(0.0, 0.0),),
+    (Vectors.sparse(2, [1], [1.0]),),
+]
+input_table = t_env.from_data_stream(
+    env.from_collection(
+        vectors,
+        type_info=Types.ROW_NAMED(
+            ['vector'],
+            [VectorTypeInfo()])
+    ))
+
+# convert each vector to a double array
+output_table = input_table.select(vector_to_array(col('vector')).alias('array'))
+
+# extract and display the results
+output_values = [x for x in
+                 t_env.to_data_stream(output_table).map(lambda r: r).execute_and_collect()]
+
+output_values.sort(key=lambda x: x[0])
+
+field_names = output_table.get_schema().get_field_names()
+for i in range(len(output_values)):
+    vector = vectors[i][0]
+    double_array = output_values[i][field_names.index("array")]
+    print("Input vector: %s \t output double array: %s" % (vector, double_array))
+```
+{{< /tab>}}
+
+{{< /tabs>}}
+
+### arrayToVector
+
+This function converts a column of arrays of numeric type into a column of
+DenseVector instances.
+
+{{< tabs arrayToVector_examples >}}
+
+{{< tab "Java">}}
+```java
+import org.apache.flink.ml.linalg.Vector;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static org.apache.flink.ml.Functions.arrayToVector;
+import static org.apache.flink.table.api.Expressions.$;
+
+/** Simple program that converts a column of double arrays into a column of dense vectors. */
+public class ArrayToVectorExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        // Generates input double array data.
+        List<double[]> doubleArrays =
+                Arrays.asList(new double[] {0.0, 0.0}, new double[] {0.0, 1.0});
+        Table inputTable = tEnv.fromDataStream(env.fromCollection(doubleArrays)).as("array");
+
+        // Converts each double array to a dense vector.
+        Table outputTable = inputTable.select($("array"), arrayToVector($("array")).as("vector"));
+
+        // Extracts and displays the results.
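+        // row.getFieldAs(...) again resolves fields by name: "array" holds the
+        // original array and "vector" the DenseVector built by arrayToVector.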
+        for (CloseableIterator<Row> it = outputTable.execute().collect(); it.hasNext(); ) {
+            Row row = it.next();
+            Double[] doubleArray = row.getFieldAs("array");
+            Vector vector = row.getFieldAs("vector");
+            System.out.printf(
+                    "Input double array: %s\tOutput vector: %s\n",
+                    Arrays.toString(doubleArray), vector);
+        }
+    }
+}
+```
+{{< /tab>}}

+{{< tab "Python">}}
+```python
+# Simple program that converts a column of double arrays into a column of dense vectors.
+
+from pyflink.common import Types
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.ml.functions import array_to_vector
+from pyflink.table import StreamTableEnvironment
+from pyflink.table.expressions import col
+
+# create a new StreamExecutionEnvironment
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# create a StreamTableEnvironment
+t_env = StreamTableEnvironment.create(env)
+
+# generate input double array data
+double_arrays = [
+    ([0.0, 0.0],),
+    ([0.0, 1.0],),
+]
+input_table = t_env.from_data_stream(
+    env.from_collection(
+        double_arrays,
+        type_info=Types.ROW_NAMED(
+            ['array'],
+            [Types.PRIMITIVE_ARRAY(Types.DOUBLE())])
+    ))
+
+# convert each double array to a dense vector
+output_table = input_table.select(array_to_vector(col('array')).alias('vector'))
+
+# extract and display the results
+field_names = output_table.get_schema().get_field_names()
+
+output_values = [x[field_names.index('vector')] for x in
+                 t_env.to_data_stream(output_table).execute_and_collect()]
+
+output_values.sort(key=lambda x: x.get(1))
+
+for i in range(len(output_values)):
+    double_array = double_arrays[i][0]
+    vector = output_values[i]
+    print("Input double array: %s \t output vector: %s" % (double_array, vector))
+```
+{{< /tab>}}
+
+{{< /tabs>}}
diff --git a/docs/content.zh/docs/operators/recommendation/_index.md b/docs/content.zh/docs/operators/recommendation/_index.md
new file mode 100644
index 000000000..7b8f23e54
--- /dev/null
+++ b/docs/content.zh/docs/operators/recommendation/_index.md
@@ -0,0 +1,25 @@
+---
+title: Recommendation
+bookCollapseSection: true
+weight: 1
+aliases:
+  - /operators/recommendation/
+---
+
diff --git a/docs/content.zh/docs/operators/recommendation/swing.md b/docs/content.zh/docs/operators/recommendation/swing.md
new file mode 100644
index 000000000..601ee820f
--- /dev/null
+++ b/docs/content.zh/docs/operators/recommendation/swing.md
@@ -0,0 +1,194 @@
+---
+title: "Swing"
+type: docs
+aliases:
+- /operators/recommendation/swing.html
+---
+
+## Swing
+
+An AlgoOperator which implements the Swing algorithm.
+
+Swing is an item recall algorithm. The topology of the user-item graph can
+usually be described as user-item-user or item-user-item, which resembles a
+swing. For example, if user u and user v have both purchased the same
+commodity i, the two users and the commodity form a relationship diagram
+similar to a swing. If u and v have purchased commodity j in addition to i,
+then i and j are presumed to be similar.
+
+See "Large Scale Product Graph Construction for Recommendation in
+E-commerce" by Xiaoyong Yang, Yadong Zhu and Yi Zhang.
+
+### Input Columns
+
+| Param name | Type | Default  | Description |
+|:-----------|:-----|:---------|:------------|
+| itemCol    | Long | `"item"` | Item id.    |
+| userCol    | Long | `"user"` | User id.    |
+
+### Output Columns
+
+| Param name | Type   | Default    | Description                                                                                     |
+|:-----------|:-------|:-----------|:------------------------------------------------------------------------------------------------|
+| itemCol    | Long   | `"item"`   | Item id.                                                                                        |
+| outputCol  | String | `"output"` | Top k similar items and their corresponding scores. (e.g. "item_1,0.9;item_2,0.7;item_3,0.35") |
+
+### Parameters
+
+Below are the parameters required by `Swing`.
+
+| Key               | Default    | Type    | Required | Description |
+|:------------------|:-----------|:--------|:---------|:------------|
+| userCol           | `"user"`   | String  | no       | User column name. |
+| itemCol           | `"item"`   | String  | no       | Item column name. |
+| maxUserNumPerItem | `1000`     | Integer | no       | The max number of users (purchasers) to keep for each item. If the number of users is larger than this value, only maxUserNumPerItem users will be sampled and considered when computing the similarity between two items. |
+| k                 | `100`      | Integer | no       | The max number of similar items to output for each item. |
+| minUserBehavior   | `10`       | Integer | no       | The min number of items a user must have purchased. Users who purchased fewer items than this value are filtered out while gathering data. This can affect the speed of the computation. Set minUserBehavior larger in case the Swing recommendation progresses very slowly. |
+| maxUserBehavior   | `1000`     | Integer | no       | The max number of items a user may have purchased. Users who purchased more items than this value are filtered out while gathering data. This can affect the speed of the computation. Set maxUserBehavior smaller in case the Swing recommendation progresses very slowly. An IllegalArgumentException is raised if maxUserBehavior is smaller than minUserBehavior. |
+| alpha1            | `15`       | Integer | no       | Smooth factor for the number of users that have purchased one item. The higher alpha1 is, the less purchasing behavior contributes to the similarity score. |
+| alpha2            | `0`        | Integer | no       | Smooth factor for the number of users that have purchased the two target items. The higher alpha2 is, the less purchasing behavior contributes to the similarity score. |
+| beta              | `0.3`      | Double  | no       | Decay factor for the number of users that have purchased one item. The higher beta is, the less purchasing behavior contributes to the similarity score. |
+| outputCol         | `"output"` | String  | no       | Output column name. |
+
+### Examples
+
+{{< tabs examples >}}
+
+{{< tab "Java">}}
+
+```java
+package org.apache.flink.ml.examples.recommendation;
+
+import org.apache.flink.ml.recommendation.swing.Swing;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+
+/**
+ * Simple program that creates a Swing instance and uses it to generate recommendations for items.
+ */
+public class SwingExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        // Generates input data.
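+        // Each row is a (userId, itemId) pair standing for one purchase event.
+        // Swing scores an item pair (i, j) by summing, over every pair of users
+        // (u, v) that bought both i and j, a weight that shrinks as u and v share
+        // more common items -- roughly 1 / (alpha + |I_u ∩ I_v|) in the cited
+        // paper (this formula is a sketch from the paper, not from this codebase).
+        // Users 0 and 2 both bought items 10, 11 and 12, so those three items
+        // should be reported as similar to one another in the output.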
+        DataStream<Row> inputStream =
+                env.fromElements(
+                        Row.of(0L, 10L),
+                        Row.of(0L, 11L),
+                        Row.of(0L, 12L),
+                        Row.of(1L, 13L),
+                        Row.of(1L, 12L),
+                        Row.of(2L, 10L),
+                        Row.of(2L, 11L),
+                        Row.of(2L, 12L),
+                        Row.of(3L, 13L),
+                        Row.of(3L, 12L));
+
+        Table inputTable = tEnv.fromDataStream(inputStream).as("user", "item");
+
+        // Creates a Swing object and initializes its parameters.
+        Swing swing = new Swing().setUserCol("user").setItemCol("item").setMinUserBehavior(1);
+
+        // Transforms the data.
+        Table[] outputTable = swing.transform(inputTable);
+
+        // Extracts and displays the result of the Swing algorithm.
+        for (CloseableIterator<Row> it = outputTable[0].execute().collect(); it.hasNext(); ) {
+            Row row = it.next();
+
+            long mainItem = row.getFieldAs(0);
+            String itemRankScore = row.getFieldAs(1);
+
+            System.out.printf("item: %d, top-k similar items: %s\n", mainItem, itemRankScore);
+        }
+    }
+}
+
+```
+
+{{< /tab>}}
+
+{{< tab "Python">}}
+
+```python
+# Simple program that creates a Swing instance and gives recommendations for items.
+
+from pyflink.common import Types
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.table import StreamTableEnvironment
+
+from pyflink.ml.recommendation.swing import Swing
+
+# Creates a new StreamExecutionEnvironment.
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# Creates a StreamTableEnvironment.
+t_env = StreamTableEnvironment.create(env)
+
+# Generates input data.
+input_table = t_env.from_data_stream(
+    env.from_collection([
+        (0, 10),
+        (0, 11),
+        (0, 12),
+        (1, 13),
+        (1, 12),
+        (2, 10),
+        (2, 11),
+        (2, 12),
+        (3, 13),
+        (3, 12)
+    ],
+        type_info=Types.ROW_NAMED(
+            ['user', 'item'],
+            [Types.LONG(), Types.LONG()])
+    ))
+
+# Creates a Swing object and initializes its parameters.
+swing = Swing() \
+    .set_item_col('item') \
+    .set_user_col("user") \
+    .set_min_user_behavior(1)
+
+# Transforms the data to the Swing algorithm result.
+output_table = swing.transform(input_table)
+
+# Extracts and displays the results.
+field_names = output_table[0].get_schema().get_field_names()
+
+results = t_env.to_data_stream(
+    output_table[0]).execute_and_collect()
+
+for result in results:
+    main_item = result[field_names.index(swing.get_item_col())]
+    item_rank_score = result[1]
+    print(f'item: {main_item}, top-k similar items: {item_rank_score}')
+
+```
+
+{{< /tab>}}
+
+{{< /tabs>}}
diff --git a/docs/content.zh/docs/operators/regression/_index.md b/docs/content.zh/docs/operators/regression/_index.md
new file mode 100644
index 000000000..f718afe1a
--- /dev/null
+++ b/docs/content.zh/docs/operators/regression/_index.md
@@ -0,0 +1,25 @@
+---
+title: Regression
+bookCollapseSection: true
+weight: 1
+aliases:
+  - /operators/regression/
+---
+
diff --git a/docs/content.zh/docs/operators/regression/linearregression.md b/docs/content.zh/docs/operators/regression/linearregression.md
new file mode 100644
index 000000000..3b9717be5
--- /dev/null
+++ b/docs/content.zh/docs/operators/regression/linearregression.md
@@ -0,0 +1,188 @@
+---
+title: "Linear Regression"
+type: docs
+aliases:
+- /operators/regression/linearregression.html
+---
+
+## Linear Regression
+
+Linear regression is a regression approach that models the relationship
+between a scalar response and one or more explanatory variables with a linear
+function, i.e. a model of the form label = w · features + b, optionally with
+regularization (see the `reg` and `elasticNet` parameters below).
+
+### Input Columns
+
+| Param name  | Type   | Default      | Description       |
+| :---------- | :----- | :----------- |:------------------|
+| featuresCol | Vector | `"features"` | Feature vector.   |
+| labelCol    | Double | `"label"`    | Label to predict. |
+| weightCol   | Double | `"weight"`   | Weight of sample. |
+
+### Output Columns
+
+| Param name    | Type   | Default        | Description      |
+| :------------ | :----- | :------------- |:-----------------|
+| predictionCol | Double | `"prediction"` | Predicted value. |
+
+### Parameters
+
+Below are the parameters required by `LinearRegressionModel`.
+
+| Key           | Default        | Type   | Required | Description             |
+| ------------- | -------------- | ------ | -------- | ----------------------- |
+| featuresCol   | `"features"`   | String | no       | Features column name.   |
+| predictionCol | `"prediction"` | String | no       | Prediction column name. |
+
+In addition to the parameters above, `LinearRegression` also needs the
+parameters below.
+
+| Key             | Default   | Type    | Required | Description                                     |
+| --------------- | --------- | ------- | -------- | ----------------------------------------------- |
+| labelCol        | `"label"` | String  | no       | Label column name.                              |
+| weightCol       | `null`    | String  | no       | Weight column name.                             |
+| maxIter         | `20`      | Integer | no       | Maximum number of iterations.                   |
+| reg             | `0.`      | Double  | no       | Regularization parameter.                       |
+| elasticNet      | `0.`      | Double  | no       | ElasticNet parameter.                           |
+| learningRate    | `0.1`     | Double  | no       | Learning rate of optimization method.           |
+| globalBatchSize | `32`      | Integer | no       | Global batch size of training algorithms.       |
+| tol             | `1e-6`    | Double  | no       | Convergence tolerance for iterative algorithms. |
+
+### Examples
+
+{{< tabs examples >}}
+
+{{< tab "Java">}}
+
+```java
+import org.apache.flink.ml.linalg.DenseVector;
+import org.apache.flink.ml.linalg.Vectors;
+import org.apache.flink.ml.regression.linearregression.LinearRegression;
+import org.apache.flink.ml.regression.linearregression.LinearRegressionModel;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+
+/** Simple program that trains a LinearRegression model and uses it for regression. */
+public class LinearRegressionExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        // Generates input data.
+        DataStream<Row> inputStream =
+                env.fromElements(
+                        Row.of(Vectors.dense(2, 1), 4.0, 1.0),
+                        Row.of(Vectors.dense(3, 2), 7.0, 1.0),
+                        Row.of(Vectors.dense(4, 3), 10.0, 1.0),
+                        Row.of(Vectors.dense(2, 4), 10.0, 1.0),
+                        Row.of(Vectors.dense(2, 2), 6.0, 1.0),
+                        Row.of(Vectors.dense(4, 3), 10.0, 1.0),
+                        Row.of(Vectors.dense(1, 2), 5.0, 1.0),
+                        Row.of(Vectors.dense(5, 3), 11.0, 1.0));
+        Table inputTable = tEnv.fromDataStream(inputStream).as("features", "label", "weight");
+
+        // Creates a LinearRegression object and initializes its parameters.
+        LinearRegression lr = new LinearRegression().setWeightCol("weight");
+
+        // Trains the LinearRegression Model.
+        LinearRegressionModel lrModel = lr.fit(inputTable);
+
+        // Uses the LinearRegression Model for predictions.
+        Table outputTable = lrModel.transform(inputTable)[0];
+
+        // Extracts and displays the results.
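+        // With this synthetic data, label = features[0] + 2 * features[1] holds
+        // exactly for every row, so the fitted model's predictions should closely
+        // track the expected results printed below.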
+        for (CloseableIterator<Row> it = outputTable.execute().collect(); it.hasNext(); ) {
+            Row row = it.next();
+            DenseVector features = (DenseVector) row.getField(lr.getFeaturesCol());
+            double expectedResult = (Double) row.getField(lr.getLabelCol());
+            double predictionResult = (Double) row.getField(lr.getPredictionCol());
+            System.out.printf(
+                    "Features: %s \tExpected Result: %s \tPrediction Result: %s\n",
+                    features, expectedResult, predictionResult);
+        }
+    }
+}
+
+```
+
+{{< /tab>}}
+
+{{< tab "Python">}}
+
+```python
+# Simple program that trains a LinearRegression model and uses it for
+# regression.
+
+from pyflink.common import Types
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo
+from pyflink.ml.regression.linearregression import LinearRegression
+from pyflink.table import StreamTableEnvironment
+
+# create a new StreamExecutionEnvironment
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# create a StreamTableEnvironment
+t_env = StreamTableEnvironment.create(env)
+
+# generate input data
+input_table = t_env.from_data_stream(
+    env.from_collection([
+        (Vectors.dense(2, 1), 4., 1.),
+        (Vectors.dense(3, 2), 7., 1.),
+        (Vectors.dense(4, 3), 10., 1.),
+        (Vectors.dense(2, 4), 10., 1.),
+        (Vectors.dense(2, 2), 6., 1.),
+        (Vectors.dense(4, 3), 10., 1.),
+        (Vectors.dense(1, 2), 5., 1.),
+        (Vectors.dense(5, 3), 11., 1.),
+    ],
+        type_info=Types.ROW_NAMED(
+            ['features', 'label', 'weight'],
+            [DenseVectorTypeInfo(), Types.DOUBLE(), Types.DOUBLE()])
+    ))
+
+# create a linear regression object and initialize its parameters
+linear_regression = LinearRegression().set_weight_col('weight')
+
+# train the linear regression model
+model = linear_regression.fit(input_table)
+
+# use the linear regression model for predictions
+output = model.transform(input_table)[0]
+
+# extract and display the results
+field_names = output.get_schema().get_field_names()
+for result in t_env.to_data_stream(output).execute_and_collect():
+    features = result[field_names.index(linear_regression.get_features_col())]
+    expected_result = result[field_names.index(linear_regression.get_label_col())]
+    prediction_result = result[field_names.index(linear_regression.get_prediction_col())]
+    print('Features: ' + str(features) + ' \tExpected Result: ' + str(expected_result)
+          + ' \tPrediction Result: ' + str(prediction_result))
+
+```
+
+{{< /tab>}}
+
+{{< /tabs>}}
diff --git a/docs/content.zh/docs/operators/stats/_index.md b/docs/content.zh/docs/operators/stats/_index.md
new file mode 100644
index 000000000..1c7ca257f
--- /dev/null
+++ b/docs/content.zh/docs/operators/stats/_index.md
@@ -0,0 +1,25 @@
+---
+title: Stats
+bookCollapseSection: true
+weight: 1
+aliases:
+  - /operators/stats/
+---
+
diff --git a/docs/content.zh/docs/operators/stats/chisqtest.md b/docs/content.zh/docs/operators/stats/chisqtest.md
new file mode 100644
index 000000000..94800a0f6
--- /dev/null
+++ b/docs/content.zh/docs/operators/stats/chisqtest.md
@@ -0,0 +1,187 @@
+---
+title: "ChiSqTest"
+type: docs
+aliases:
+- /operators/stats/chisqtest.html
+---
+
+## ChiSqTest
+
+The chi-square test computes, for each input feature, statistics of
+independence between the feature and the label in a contingency table, e.g.
+the p-value and the DOF (degree of freedom). The contingency table is
+constructed from the observed categorical values.
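+
+For intuition: Pearson's statistic is the sum of (observed - expected)^2 /
+expected over all cells of the contingency table, and an r x c table has
+(r - 1) * (c - 1) degrees of freedom. The following standalone sketch (it uses
+SciPy and is not part of Flink ML) computes the same three quantities for one
+hypothetical feature/label table of observed counts:
+
+```python
+# Standalone sanity check, independent of Flink ML: compute the chi-square
+# statistic, p-value and degree of freedom for one feature against the label.
+import numpy as np
+from scipy.stats import chi2_contingency
+
+# Rows are distinct feature values, columns are distinct label values; each
+# cell counts the samples observed with that (feature, label) combination.
+observed = np.array([
+    [4, 0, 2],
+    [2, 4, 1],
+])
+
+statistic, p_value, dof, expected = chi2_contingency(observed, correction=False)
+print(f"statistic={statistic:.4f}, p_value={p_value:.4f}, dof={dof}")  # dof = 2
+```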
+
+### Input Columns
+
+| Param name  | Type   | Default      | Description            |
+|:------------|:-------|:-------------|:-----------------------|
+| featuresCol | Vector | `"features"` | Feature vector.        |
+| labelCol    | Number | `"label"`    | Label of the features. |
+
+### Output Columns
+
+If the output result is not flattened, the output columns are as follows.
+
+| Column name        | Type      | Description                                                                                                                             |
+|--------------------|-----------|-----------------------------------------------------------------------------------------------------------------------------------------|
+| "pValues"          | Vector    | Probability of obtaining a test statistic result at least as extreme as the one that was actually observed, assuming that the null hypothesis is true. |
+| "degreesOfFreedom" | Int Array | Degree of freedom of the hypothesis test.                                                                                               |
+| "statistics"       | Vector    | Test statistic.                                                                                                                         |
+
+If the output result is flattened, the output columns are as follows.
+
+| Column name       | Type   | Description                                                                                                                             |
+|-------------------|--------|-------------------------------------------------------------------------------------------------------------------------------------------|
+| "featureIndex"    | Int    | Index of the feature in the input vectors.                                                                                              |
+| "pValue"          | Double | Probability of obtaining a test statistic result at least as extreme as the one that was actually observed, assuming that the null hypothesis is true. |
+| "degreeOfFreedom" | Int    | Degree of freedom of the hypothesis test.                                                                                               |
+| "statistic"       | Double | Test statistic.                                                                                                                         |
+
+### Parameters
+
+| Key         | Default      | Type    | Required | Description                                                                                 |
+|-------------|--------------|---------|----------|---------------------------------------------------------------------------------------------|
+| labelCol    | `"label"`    | String  | no       | Label column name.                                                                          |
+| featuresCol | `"features"` | String  | no       | Features column name.                                                                       |
+| flatten     | `false`      | Boolean | no       | If false, the returned table contains only a single row; otherwise, one row per feature.   |
+
+### Examples
+
+{{< tabs examples >}}
+
+{{< tab "Java">}}
+
+```java
+import org.apache.flink.ml.linalg.Vectors;
+import org.apache.flink.ml.stats.chisqtest.ChiSqTest;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+
+/** Simple program that creates a ChiSqTest instance and uses it for statistics. */
+public class ChiSqTestExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        // Generates input data.
+        Table inputTable =
+                tEnv.fromDataStream(
+                                env.fromElements(
+                                        Row.of(0., Vectors.dense(5, 1.)),
+                                        Row.of(2., Vectors.dense(6, 2.)),
+                                        Row.of(1., Vectors.dense(7, 2.)),
+                                        Row.of(1., Vectors.dense(5, 4.)),
+                                        Row.of(0., Vectors.dense(5, 1.)),
+                                        Row.of(2., Vectors.dense(6, 2.)),
+                                        Row.of(1., Vectors.dense(7, 2.)),
+                                        Row.of(1., Vectors.dense(5, 4.)),
+                                        Row.of(2., Vectors.dense(5, 1.)),
+                                        Row.of(0., Vectors.dense(5, 2.)),
+                                        Row.of(0., Vectors.dense(5, 2.)),
+                                        Row.of(1., Vectors.dense(9, 4.)),
+                                        Row.of(1., Vectors.dense(9, 3.))))
+                        .as("label", "features");
+
+        // Creates a ChiSqTest object and initializes its parameters.
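+        // Note: setFlatten(true) below makes the transform emit one output row
+        // per input feature instead of a single row of aggregate statistics.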
+        ChiSqTest chiSqTest =
+                new ChiSqTest().setFlatten(true).setFeaturesCol("features").setLabelCol("label");
+
+        // Uses the ChiSqTest object for statistics.
+        Table outputTable = chiSqTest.transform(inputTable)[0];
+
+        // Extracts and displays the results.
+        for (CloseableIterator<Row> it = outputTable.execute().collect(); it.hasNext(); ) {
+            Row row = it.next();
+            System.out.printf(
+                    "Feature Index: %s\tP Value: %s\tDegree of Freedom: %s\tStatistics: %s\n",
+                    row.getField("featureIndex"),
+                    row.getField("pValue"),
+                    row.getField("degreeOfFreedom"),
+                    row.getField("statistic"));
+        }
+    }
+}
+```
+
+{{< /tab>}}
+
+{{< tab "Python">}}
+
+```python
+# Simple program that creates a ChiSqTest instance and uses it for statistics.
+
+from pyflink.common import Types
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo
+from pyflink.ml.stats.chisqtest import ChiSqTest
+from pyflink.table import StreamTableEnvironment
+
+# create a new StreamExecutionEnvironment
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# create a StreamTableEnvironment
+t_env = StreamTableEnvironment.create(env)
+
+# generate input data
+input_table = t_env.from_data_stream(
+    env.from_collection([
+        (0., Vectors.dense(5, 1.)),
+        (2., Vectors.dense(6, 2.)),
+        (1., Vectors.dense(7, 2.)),
+        (1., Vectors.dense(5, 4.)),
+        (0., Vectors.dense(5, 1.)),
+        (2., Vectors.dense(6, 2.)),
+        (1., Vectors.dense(7, 2.)),
+        (1., Vectors.dense(5, 4.)),
+        (2., Vectors.dense(5, 1.)),
+        (0., Vectors.dense(5, 2.)),
+        (0., Vectors.dense(5, 2.)),
+        (1., Vectors.dense(9, 4.)),
+        (1., Vectors.dense(9, 3.))
+    ],
+        type_info=Types.ROW_NAMED(
+            ['label', 'features'],
+            [Types.DOUBLE(), DenseVectorTypeInfo()]))
+)
+
+# create a ChiSqTest object and initialize its parameters
+chi_sq_test = ChiSqTest().set_flatten(True)
+
+# use the ChiSqTest object for statistics
+output = chi_sq_test.transform(input_table)[0]
+
+# extract and display the results
+field_names = output.get_schema().get_field_names()
+for result in t_env.to_data_stream(output).execute_and_collect():
+    print("Feature Index: %s\tP Value: %s\tDegree of Freedom: %s\tStatistics: %s" %
+          (result[field_names.index('featureIndex')], result[field_names.index('pValue')],
+           result[field_names.index('degreeOfFreedom')], result[field_names.index('statistic')]))
+
+```
+
+{{< /tab>}}
+
+{{< /tabs>}}
+
diff --git a/docs/content.zh/docs/try-flink-ml/_index.md b/docs/content.zh/docs/try-flink-ml/_index.md
new file mode 100644
index 000000000..554087683
--- /dev/null
+++ b/docs/content.zh/docs/try-flink-ml/_index.md
@@ -0,0 +1,25 @@
+---
+title: Try Flink ML
+icon:
+bold: true
+bookCollapseSection: true
+weight: 1
+---
+
diff --git a/docs/content.zh/docs/try-flink-ml/java/_index.md b/docs/content.zh/docs/try-flink-ml/java/_index.md
new file mode 100644
index 000000000..7a5652525
--- /dev/null
+++ b/docs/content.zh/docs/try-flink-ml/java/_index.md
@@ -0,0 +1,23 @@
+---
+title: Java
+bookCollapseSection: true
+weight: 1
+---
+
diff --git a/docs/content.zh/docs/try-flink-ml/java/build-your-own-project.md b/docs/content.zh/docs/try-flink-ml/java/build-your-own-project.md
new file mode 100644
index 000000000..f445d06a7
--- /dev/null
+++ b/docs/content.zh/docs/try-flink-ml/java/build-your-own-project.md
@@ -0,0 +1,366 @@
+---
+title: "Building your own Flink ML project"
+weight: 2
+type: docs
+aliases:
+- /try-flink-ml/java/building-your-own-project.html
+---
+
+# Building your own Flink ML project
+
+This document provides a quick introduction to using Flink ML. Readers of this
+document will be guided to create a simple Flink job that trains a Machine
+Learning Model and uses it to provide a prediction service.
+
+## What Will You Be Building?
+
+Kmeans is a widely-used clustering algorithm that is supported by Flink ML.
+This walkthrough guides you to create a Flink job with Flink ML that initializes
+and trains a Kmeans model, and finally uses it to predict the cluster id of
+certain data points.
+
+## Prerequisites
+
+This walkthrough assumes that you have some familiarity with Java, but you
+should be able to follow along even if you are coming from a different
+programming language.
+
+## Help, I’m Stuck!
+
+If you get stuck, check out the [community support
+resources](https://flink.apache.org/gettinghelp.html). In particular, Apache
+Flink's [user mailing
+list](https://flink.apache.org/community.html#mailing-lists) is consistently
+ranked as one of the most active of any Apache project and a great way to get
+help quickly.
+
+## How To Follow Along
+
+If you want to follow along, you will require a computer with:
+
+- Java 8
+- Maven 3
+
+{{< unstable >}}
+
+Before walking through the following sections of this document, make sure you
+have downloaded Flink ML's latest code and installed Flink ML's Java SDK on your
+local machine. You can refer to this [guideline]({{< ref
+"docs/development/build-and-install#build-and-install-java-sdk" >}}) for how to
+build and install Flink ML.
+
+{{< /unstable >}}
+
+While commands to be executed in a CLI are provided to walk through this example
+in the following steps, it is recommended to use an IDE, like IntelliJ IDEA, to
+manage, build and execute the example code below.
+
+Please use the following command to create a project from the Flink Maven
+archetype, which provides the basic skeleton of a project with some necessary
+Flink dependencies.
+
+```shell
+$ mvn archetype:generate \
+    -DarchetypeGroupId=org.apache.flink \
+    -DarchetypeArtifactId=flink-quickstart-java \
+    -DarchetypeVersion=1.16.1 \
+    -DgroupId=kmeans-example \
+    -DartifactId=kmeans-example \
+    -Dversion=0.1 \
+    -Dpackage=myflinkml \
+    -DinteractiveMode=false
+```
+
+The command above would create a Maven project named `kmeans-example` in your
+current directory with the following structure:
+
+```
+$ tree kmeans-example
+kmeans-example
+├── pom.xml
+└── src
+    └── main
+        ├── java
+        │   └── myflinkml
+        │       └── DataStreamJob.java
+        └── resources
+            └── log4j2.properties
+```
+
+Change the dependencies provided in `pom.xml` as follows:
+
+```xml
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>flink-ml-uber-1.17</artifactId>
+    <version>2.4-SNAPSHOT</version>
+    <scope>provided</scope>
+</dependency>
+
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>flink-connector-files</artifactId>
+    <version>${flink.version}</version>
+    <scope>provided</scope>
+</dependency>
+
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>flink-clients</artifactId>
+    <version>${flink.version}</version>
+    <scope>provided</scope>
+</dependency>
+
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>flink-table-api-java-bridge</artifactId>
+    <version>${flink.version}</version>
+    <scope>provided</scope>
+</dependency>
+
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>flink-table-runtime</artifactId>
+    <version>${flink.version}</version>
+    <scope>provided</scope>
+</dependency>
+
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>flink-table-planner-loader</artifactId>
+    <version>${flink.version}</version>
+    <scope>provided</scope>
+</dependency>
+
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>statefun-flink-core</artifactId>
+    <version>3.2.0</version>
+    <exclusions>
+        <exclusion>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-streaming-java_2.12</artifactId>
+        </exclusion>
+        <exclusion>
+            <groupId>org.apache.flink</groupId>
+            <artifactId>flink-metrics-dropwizard</artifactId>
+        </exclusion>
+    </exclusions>
+</dependency>
+```
+
+Create file `src/main/java/myflinkml/KMeansExample.java`, and save the following
+content into the file. You may feel free to ignore and delete
+`src/main/java/myflinkml/DataStreamJob.java` as it is not used in this
+walkthrough.
+
+```java
+package myflinkml;
+
+import org.apache.flink.ml.clustering.kmeans.KMeans;
+import org.apache.flink.ml.clustering.kmeans.KMeansModel;
+import org.apache.flink.ml.linalg.DenseVector;
+import org.apache.flink.ml.linalg.Vectors;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.CloseableIterator;
+
+public class KMeansExample {
+    public static void main(String[] args) {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+        String featuresCol = "features";
+        String predictionCol = "prediction";
+
+        // Generates train data and predict data as DataStream.
+        DataStream<DenseVector> inputStream = env.fromElements(
+                Vectors.dense(0.0, 0.0),
+                Vectors.dense(0.0, 0.3),
+                Vectors.dense(0.3, 0.0),
+                Vectors.dense(9.0, 0.0),
+                Vectors.dense(9.0, 0.6),
+                Vectors.dense(9.6, 0.0)
+        );
+
+        // Converts data from DataStream to Table, as Flink ML uses Table API.
+        Table input = tEnv.fromDataStream(inputStream).as(featuresCol);
+
+        // Creates a K-means object and initializes its parameters.
+        KMeans kmeans = new KMeans()
+                .setK(2)
+                .setSeed(1L)
+                .setFeaturesCol(featuresCol)
+                .setPredictionCol(predictionCol);
+
+        // Trains the K-means Model.
+        KMeansModel model = kmeans.fit(input);
+
+        // Uses the K-means Model for predictions.
+        Table output = model.transform(input)[0];
+
+        // Extracts and displays the prediction result.
+        for (CloseableIterator<Row> it = output.execute().collect(); it.hasNext(); ) {
+            Row row = it.next();
+            DenseVector vector = (DenseVector) row.getField(featuresCol);
+            int clusterId = (Integer) row.getField(predictionCol);
+            System.out.println("Vector: " + vector + "\tCluster ID: " + clusterId);
+        }
+    }
+}
+```
+
+After placing the code above into your Maven project, you may use the following
+command or your IDE to build and execute the example job.
+
+```shell
+cd kmeans-example/
+mvn clean package
+mvn exec:java -Dexec.mainClass="myflinkml.KMeansExample" -Dexec.classpathScope="compile"
+```
+
+If you are running the project in an IDE, you may get a
+`java.lang.NoClassDefFoundError` exception. This is probably because you do not
+have all required Flink dependencies implicitly loaded into the classpath.
+
+- IntelliJ IDEA: Go to Run > Edit Configurations > Modify options > Select
+  `include dependencies with "Provided" scope`. This run configuration will now
+  include all required classes to run the application from within the IDE.
+
+After executing the job, information like below will be printed out to your
+terminal window.
+
+```
+Vector: [0.3, 0.0]	Cluster ID: 1
+Vector: [9.6, 0.0]	Cluster ID: 0
+Vector: [9.0, 0.6]	Cluster ID: 0
+Vector: [0.0, 0.0]	Cluster ID: 1
+Vector: [0.0, 0.3]	Cluster ID: 1
+Vector: [9.0, 0.0]	Cluster ID: 0
+```
+
+The program might get stuck after printing out the information above, and you
+may need to enter ^C to terminate the process. This only happens when the
+program is executed locally and would not happen when the job is submitted to a
+Flink cluster.
+
+## Breaking Down The Code
+
+### The Execution Environment
+
+The first lines set up the `StreamExecutionEnvironment` to execute the Flink ML
+job. You will be familiar with this concept if you have experience using Flink.
+For the example program in this document, a simple `StreamExecutionEnvironment`
+without specific configurations would be enough.
+
+Given that Flink ML uses Flink's Table API, a `StreamTableEnvironment` would
+also be necessary for the following program.
+
+```java
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+```
+
+### Creating Training & Inference Data Table
+
+Then the program creates the Table containing data for the training and
+prediction process of the following Kmeans algorithm. Flink ML operators look
+up their input data by column name in the input table, and write prediction
+results to a designated column of the output Table.
+
+```java
+DataStream<DenseVector> inputStream = env.fromElements(
+        Vectors.dense(0.0, 0.0),
+        Vectors.dense(0.0, 0.3),
+        Vectors.dense(0.3, 0.0),
+        Vectors.dense(9.0, 0.0),
+        Vectors.dense(9.0, 0.6),
+        Vectors.dense(9.6, 0.0)
+);
+
+Table input = tEnv.fromDataStream(inputStream).as(featuresCol);
+```
+
+### Creating, Configuring, Training & Using Kmeans
+
+Flink ML's classes for the Kmeans algorithm include `KMeans` and `KMeansModel`.
+`KMeans` implements the training process of the Kmeans algorithm based on the
+provided training data, and finally generates a `KMeansModel`. The
+`KMeansModel.transform()` method encodes the transformation logic of this
+algorithm and is used for predictions.
+
+Both `KMeans` and `KMeansModel` provide getter/setter methods for the Kmeans
+algorithm's configuration parameters. The example program explicitly sets the
+following parameters, and other configuration parameters will have their default
+values used.
+
+- `K`, the number of clusters to create
+- `seed`, the random seed to initialize cluster centers
+- `featuresCol`, name of the column containing input feature vectors
+- `predictionCol`, name of the column to output prediction results
+
+When the program invokes `KMeans.fit()` to generate a `KMeansModel`, the
+`KMeansModel` will inherit the `KMeans` object's configuration parameters. Thus
+it is possible to set a `KMeansModel`'s parameters directly on the `KMeans`
+object.
+
+```java
+KMeans kmeans = new KMeans()
+        .setK(2)
+        .setSeed(1L)
+        .setFeaturesCol(featuresCol)
+        .setPredictionCol(predictionCol);
+
+KMeansModel model = kmeans.fit(input);
+
+Table output = model.transform(input)[0];
+```
+
+### Collecting Prediction Result
+
+Like all other Flink programs, the code described in the sections above only
+configures the computation graph of a Flink job, and the program only evaluates
+the computation logic and collects outputs after the `execute()` method is
+invoked. Collected outputs from the output table would be `Row`s in which
+`featuresCol` contains input feature vectors, and `predictionCol` contains
+output prediction results, i.e., cluster IDs.
+
+```java
+for (CloseableIterator<Row> it = output.execute().collect(); it.hasNext(); ) {
+    Row row = it.next();
+    DenseVector vector = (DenseVector) row.getField(featuresCol);
+    int clusterId = (Integer) row.getField(predictionCol);
+    System.out.println("Vector: " + vector + "\tCluster ID: " + clusterId);
+}
+```
+
+```
+Vector: [0.3, 0.0]	Cluster ID: 1
+Vector: [9.6, 0.0]	Cluster ID: 0
+Vector: [9.0, 0.6]	Cluster ID: 0
+Vector: [0.0, 0.0]	Cluster ID: 1
+Vector: [0.0, 0.3]	Cluster ID: 1
+Vector: [9.0, 0.0]	Cluster ID: 0
+```
+
diff --git a/docs/content.zh/docs/try-flink-ml/java/quick-start.md b/docs/content.zh/docs/try-flink-ml/java/quick-start.md
new file mode 100644
index 000000000..9e685a983
--- /dev/null
+++ b/docs/content.zh/docs/try-flink-ml/java/quick-start.md
@@ -0,0 +1,139 @@
+---
+title: "Quick Start"
+weight: 1
+type: docs
+aliases:
+- /try-flink-ml/java/quick-start.html
+---
+
+# Quick Start
+
+This document provides a quick introduction to using Flink ML. Readers of this
+document will be guided to submit a simple Flink job that trains a Machine
+Learning Model and uses it to provide a prediction service.
+
+## Help, I’m Stuck!
+
+If you get stuck, check out the [community support
+resources](https://flink.apache.org/gettinghelp.html). In particular, Apache
+Flink's [user mailing
+list](https://flink.apache.org/community.html#mailing-lists) is consistently
+ranked as one of the most active of any Apache project and a great way to get
+help quickly.
+
+## Prerequisites
+
+Make sure Java 8 or a higher version has been installed on your local machine.
+To check the Java version installed, type in your terminal:
+
+```shell
+$ java -version
+```
+
+## Download Flink
+
+Download [Flink 1.17](https://flink.apache.org/downloads.html), then extract the archive:
+
+```shell
+$ tar -xzf flink-*.tgz
+```
+
+## Set Up Flink Environment Variables
+
+Run the following commands after having downloaded Flink:
+
+```bash
+cd ${path_to_flink}
+export FLINK_HOME=`pwd`
+```
+
+## Add Flink ML library to Flink's library folder
+
+You need to copy Flink ML's library files to Flink's `lib` folder for proper
+initialization.
+
+{{< stable >}}
+
+Please download [Flink ML Python
+source](https://flink.apache.org/downloads.html) and extract the jar files into
+Flink's library folder.
+
+```shell
+tar -xzf apache-flink-ml*.tar.gz
+cp apache-flink-ml-*/deps/lib/* $FLINK_HOME/lib/
+```
+
+{{< /stable >}} {{< unstable >}}
+
+Please walk through this [guideline]({{< ref
+"docs/development/build-and-install#build-and-install-java-sdk" >}}) to build
+Flink ML's Java SDK. After that, you may copy the generated library files to
+Flink's folder with the following commands.
+
+```shell
+cd ${path_to_flink_ml}
+cp ./flink-ml-dist/target/flink-ml-*-bin/flink-ml*/lib/*.jar $FLINK_HOME/lib/
+```
+
+{{< /unstable >}}
+
+## Run Flink ML example job
+
+Please start a Flink standalone cluster in your local environment with the
+following command.
+
+```bash
+$FLINK_HOME/bin/start-cluster.sh
+```
+
+You should be able to navigate to the web UI at
+[localhost:8081](http://localhost:8081/) to view the Flink dashboard and see
+that the cluster is up and running.
+
+Then you may submit Flink ML examples to the cluster as follows.
+
+```bash
+$FLINK_HOME/bin/flink run -c org.apache.flink.ml.examples.clustering.KMeansExample $FLINK_HOME/lib/flink-ml-examples*.jar
+```
+
+The command above would submit and execute Flink ML's `KMeansExample` job.
There +are also example jobs for other Flink ML algorithms, and you can find them in +`flink-ml-examples` module. + +A sample output in your terminal is as follows. + +``` +Features: [9.0, 0.0] Cluster ID: 1 +Features: [0.3, 0.0] Cluster ID: 0 +Features: [0.0, 0.3] Cluster ID: 0 +Features: [9.6, 0.0] Cluster ID: 1 +Features: [0.0, 0.0] Cluster ID: 0 +Features: [9.0, 0.6] Cluster ID: 1 + +``` + +Now you have successfully run a Flink ML job. + +Finally, you can stop the Flink standalone cluster with the following command. + +```bash +$FLINK_HOME/bin/stop-cluster.sh +``` diff --git a/docs/content.zh/docs/try-flink-ml/python/_index.md b/docs/content.zh/docs/try-flink-ml/python/_index.md new file mode 100644 index 000000000..f86e4cc69 --- /dev/null +++ b/docs/content.zh/docs/try-flink-ml/python/_index.md @@ -0,0 +1,23 @@ +--- +title: Python +bookCollapseSection: true +weight: 1 +--- + diff --git a/docs/content.zh/docs/try-flink-ml/python/quick-start.md b/docs/content.zh/docs/try-flink-ml/python/quick-start.md new file mode 100644 index 000000000..08fac05bf --- /dev/null +++ b/docs/content.zh/docs/try-flink-ml/python/quick-start.md @@ -0,0 +1,338 @@ +--- +title: "Quick Start" +weight: 1 +type: docs +aliases: +- /try-flink-ml/python/quick-start.html + +--- + + + +# Quick Start + +This document provides a quick introduction to using Flink ML. Readers of this +document will be guided to create a simple Flink job that trains a Machine +Learning Model and uses it to provide prediction service. + +## What Will You Be Building? + +Kmeans is a widely-used clustering algorithm and has been supported by Flink ML. +This walkthrough guides you to create a Flink job with Flink ML that initializes +and trains a Kmeans model, and finally uses it to predict the cluster id of +certain data points. + +## Prerequisites + +This walkthrough assumes that you have some familiarity with Python, but you +should be able to follow along even if you come from a different programming +language. + +## Help, I’m Stuck! + +If you get stuck, check out the [community support +resources](https://flink.apache.org/gettinghelp.html). In particular, Apache +Flink's [user mailing +list](https://flink.apache.org/community.html#mailing-lists) is consistently +ranked as one of the most active of any Apache project and a great way to get +help quickly. + +## How To Follow Along + +If you want to follow along, you will require a computer with: + +{{< stable >}} +- Java 8 +- Python 3.7 or 3.8 {{< /stable >}} {{< unstable >}} +- Java 8 +- Maven 3 +- Python 3.7 or 3.8 {{< /unstable >}} + +{{< stable >}} + +This walkthrough requires installing Flink ML Python SDK, which is available on +[PyPi](https://pypi.org/project/apache-flink-ml/) and can be easily installed +using pip. + +```bash +$ python -m pip install apache-flink-ml=={{< version >}} +``` + +{{< /stable >}} {{< unstable >}} + +Please walk through this [guideline]({{< ref +"docs/development/build-and-install#build-and-install-python-sdk" >}}) to build +and install Flink ML's Python SDK in your local environment. + +{{< /unstable >}} + +## Writing a Flink ML Python Program + +Flink ML programs begin by setting up the `StreamExecutionEnvironment` to +execute the Flink ML job. You would have been familiar with this concept if you +have experience using Flink. For the example program in this document, a simple +`StreamExecutionEnvironment` without specific configurations would be enough. 
+
+Given that Flink ML uses Flink's Table API, a `StreamTableEnvironment` would
+also be necessary for the following program.
+
+```python
+# create a new StreamExecutionEnvironment
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# create a StreamTableEnvironment
+t_env = StreamTableEnvironment.create(env)
+```
+
+Then you can create the Table containing data for the training and prediction
+process of the following Kmeans algorithm. Flink ML operators look up their
+input data by column name in the input table, and write prediction results to
+a designated column of the output Table.
+
+```python
+# generate input data
+input_data = t_env.from_data_stream(
+    env.from_collection([
+        (Vectors.dense([0.0, 0.0]),),
+        (Vectors.dense([0.0, 0.3]),),
+        (Vectors.dense([0.3, 3.0]),),
+        (Vectors.dense([9.0, 0.0]),),
+        (Vectors.dense([9.0, 0.6]),),
+        (Vectors.dense([9.6, 0.0]),),
+    ],
+        type_info=Types.ROW_NAMED(
+            ['features'],
+            [DenseVectorTypeInfo()])))
+```
+
+Flink ML's classes for the Kmeans algorithm include `KMeans` and `KMeansModel`.
+`KMeans` implements the training process of the Kmeans algorithm based on the
+provided training data, and finally generates a `KMeansModel`. The
+`KMeansModel.transform()` method encodes the transformation logic of this
+algorithm and is used for predictions.
+
+Both `KMeans` and `KMeansModel` provide getter/setter methods for the Kmeans
+algorithm's configuration parameters. This example program explicitly sets the
+following parameters, and other configuration parameters will have their default
+values used.
+
+- `k`, the number of clusters to create
+- `seed`, the random seed to initialize cluster centers
+
+When the program invokes `KMeans.fit()` to generate a `KMeansModel`, the
+`KMeansModel` will inherit the `KMeans` object's configuration parameters. Thus
+it is possible to set a `KMeansModel`'s parameters directly on the `KMeans`
+object.
+
+```python
+# create a kmeans object and initialize its parameters
+kmeans = KMeans().set_k(2).set_seed(1)
+
+# train the kmeans model
+model = kmeans.fit(input_data)
+
+# use the kmeans model for predictions
+output = model.transform(input_data)[0]
+```
+
+Like all other Flink programs, the code described in the sections above only
+configures the computation graph of a Flink job, and the program only evaluates
+the computation logic and collects outputs after the `execute()` method is
+invoked. Collected outputs from the output table would be `Row`s in which
+`featuresCol` contains input feature vectors, and `predictionCol` contains
+output prediction results, i.e., cluster IDs.
+
+```python
+# extract and display the results
+field_names = output.get_schema().get_field_names()
+for result in t_env.to_data_stream(output).execute_and_collect():
+    features = result[field_names.index(kmeans.get_features_col())]
+    cluster_id = result[field_names.index(kmeans.get_prediction_col())]
+    print('Features: ' + str(features) + ' \tCluster Id: ' + str(cluster_id))
+```
+
+The complete code so far:
+
+```python
+from pyflink.common import Types
+from pyflink.datastream import StreamExecutionEnvironment
+from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo
+from pyflink.ml.clustering.kmeans import KMeans
+from pyflink.table import StreamTableEnvironment
+
+# create a new StreamExecutionEnvironment
+env = StreamExecutionEnvironment.get_execution_environment()
+
+# create a StreamTableEnvironment
+t_env = StreamTableEnvironment.create(env)
+
+# generate input data
+input_data = t_env.from_data_stream(
+    env.from_collection([
+        (Vectors.dense([0.0, 0.0]),),
+        (Vectors.dense([0.0, 0.3]),),
+        (Vectors.dense([0.3, 3.0]),),
+        (Vectors.dense([9.0, 0.0]),),
+        (Vectors.dense([9.0, 0.6]),),
+        (Vectors.dense([9.6, 0.0]),),
+    ],
+        type_info=Types.ROW_NAMED(
+            ['features'],
+            [DenseVectorTypeInfo()])))
+
+# create a kmeans object and initialize its parameters
+kmeans = KMeans().set_k(2).set_seed(1)
+
+# train the kmeans model
+model = kmeans.fit(input_data)
+
+# use the kmeans model for predictions
+output = model.transform(input_data)[0]
+
+# extract and display the results
+field_names = output.get_schema().get_field_names()
+for result in t_env.to_data_stream(output).execute_and_collect():
+    features = result[field_names.index(kmeans.get_features_col())]
+    cluster_id = result[field_names.index(kmeans.get_prediction_col())]
+    print('Features: ' + str(features) + ' \tCluster Id: ' + str(cluster_id))
+```
+
+## Executing a Flink ML Python Program locally
+
+After creating a Python file (e.g. kmeans_example.py) and saving the code above
+into the file, you can run the example on the command line:
+
+```shell
+python kmeans_example.py
+```
+
+The command above would build the example job and run it in a local mini
+cluster. A sample output in your terminal is as follows.
+
+```
+Features: [9.6,0.0] 	Cluster Id: 0
+Features: [9.0,0.6] 	Cluster Id: 0
+Features: [0.0,0.3] 	Cluster Id: 1
+Features: [0.0,0.0] 	Cluster Id: 1
+Features: [0.3,3.0] 	Cluster Id: 1
+Features: [9.0,0.0] 	Cluster Id: 0
+```
+
+## Executing a Flink ML Python Program on a Flink Cluster
+
+### Prerequisites
+
+Make sure Java 8 or a higher version has been installed on your local machine.
+To check the Java version installed, type in your terminal:
+
+```shell
+$ java -version
+```
+
+### Download Flink
+
+Download [Flink 1.17](https://flink.apache.org/downloads.html), then extract the archive:
+
+```shell
+$ tar -xzf flink-*.tgz
+```
+
+### Set Up Flink Library and Environment Variables
+
+Run the following commands after having downloaded Flink:
+
+```bash
+cd ${path_to_flink}
+cp opt/flink-python* lib/
+export FLINK_HOME=`pwd`
+```
+
+### Add Flink ML library to Flink's library folder
+
+You need to copy Flink ML's library files to Flink's `lib` folder for proper
+initialization.
+
+{{< stable >}}
+
+Please download [Flink ML Python
+source](https://flink.apache.org/downloads.html) and extract the jar files into
+Flink's library folder.
+
+```shell
+tar -xzf apache-flink-ml*.tar.gz
+cp apache-flink-ml-*/deps/lib/* $FLINK_HOME/lib/
+```
+
+{{< /stable >}} {{< unstable >}}
+
+Given that you have followed this [guideline]({{< ref
+"docs/development/build-and-install#build-and-install-java-sdk" >}}), you
+would have already built Flink ML's Java SDK. Now, you need to copy the
+generated library files to Flink's folder with the following commands.
+
+```shell
+cd ${path_to_flink_ml}
+cp ./flink-ml-dist/target/flink-ml-*-bin/flink-ml*/lib/*.jar $FLINK_HOME/lib/
+```
+
+{{< /unstable >}}
+
+### Run Flink ML job
+
+Please start a Flink standalone cluster in your local environment with the
+following command.
+
+```bash
+$FLINK_HOME/bin/start-cluster.sh
+```
+
+You should be able to navigate to the web UI at
+[localhost:8081](http://localhost:8081/) to view the Flink dashboard and see
+that the cluster is up and running.
+
+After creating a Python file (e.g. kmeans_example.py) and saving the code above
+into the file, you may submit the example job to the cluster as follows.
+
+```bash
+$FLINK_HOME/bin/flink run -py kmeans_example.py
+```
+
+A sample output in your terminal is as follows.
+
+```
+Features: [9.6,0.0] 	Cluster Id: 0
+Features: [9.0,0.6] 	Cluster Id: 0
+Features: [0.0,0.3] 	Cluster Id: 1
+Features: [0.0,0.0] 	Cluster Id: 1
+Features: [0.3,3.0] 	Cluster Id: 1
+Features: [9.0,0.0] 	Cluster Id: 0
+```
+
+Now you have successfully run the Flink ML job on a Flink cluster. More
+detailed instructions for submitting it to a Flink cluster can be found in [Job
+Submission
+Examples](https://nightlies.apache.org/flink/flink-docs-master/docs/deployment/cli/#submitting-pyflink-jobs).
+
+Finally, you can stop the Flink standalone cluster with the following command.
+
+```bash
+$FLINK_HOME/bin/stop-cluster.sh
+```
diff --git a/docs/content.zh/versions.md b/docs/content.zh/versions.md
new file mode 100644
index 000000000..e59db1041
--- /dev/null
+++ b/docs/content.zh/versions.md
@@ -0,0 +1,29 @@
+---
+title: Versions
+type: docs
+bookToc: false
+---
+
+# Versions
+
+An appendix of hosted documentation for all versions of the Apache Flink
+Machine Learning Library.
+
+{{< all_versions >}}