From e8670dca235ba416692e30c03b4fbce2639e152f Mon Sep 17 00:00:00 2001 From: nlsegerl Date: Tue, 27 Sep 2016 14:14:11 -0700 Subject: [PATCH 1/6] spot_naming duxbay and oni and open-network-insight and all such variations renamed to "spot" --- .gitmodules | 3 -- INSTALL.md | 18 +++++------ README.md | 26 ++++++++-------- build.sbt | 2 +- install_ml.sh | 7 ++--- ml_ops.sh | 6 ++-- .../SpotLDACWrapper.scala} | 24 +++++++-------- .../SuspiciousConnects.scala | 12 ++++---- .../SuspiciousConnectsArgumentParser.scala | 2 +- .../SuspiciousConnectsScoreFunction.scala | 2 +- .../dns/DNSSchema.scala | 4 +-- .../dns/DNSSuspiciousConnectsAnalysis.scala | 14 ++++----- .../dns/DNSWordCreation.scala | 6 ++-- .../dns/model/DNSFeedback.scala | 4 +-- .../dns/model/DNSScoreFunction.scala | 6 ++-- .../model/DNSSuspiciousConnectsModel.scala | 22 +++++++------- .../sideinformation/DNSSideInformation.scala | 19 +++++++----- .../DNSSideInformationFunction.scala | 7 +++-- .../netflow/FlowColumnIndex.scala | 2 +- .../netflow/FlowPostLDA.scala | 4 +-- .../netflow/FlowPreLDA.scala | 10 +++---- .../netflow/FlowSchema.scala | 2 +- .../netflow/FlowSuspiciousConnects.scala | 10 +++---- .../netflow/FlowWordCreation.scala | 6 ++-- .../proxy/ProxyFeedback.scala | 5 ++-- .../proxy/ProxySchema.scala | 2 +- .../ProxySuspiciousConnectsAnalysis.scala | 8 ++--- .../proxy/ProxySuspiciousConnectsModel.scala | 26 ++++++++-------- .../proxy/ProxyWordCreation.scala | 4 +-- .../utilities/CountryCodes.scala | 2 +- .../utilities/DataFrameUtils.scala | 2 +- .../utilities/DomainProcessor.scala | 2 +- .../utilities/Entropy.scala | 2 +- .../utilities/Quantiles.scala | 2 +- .../utilities/TimeUtilities.scala | 2 +- .../utilities/TopDomains.scala | 2 +- .../DNSWordCreationTest.scala | 8 ++--- .../FlowWordCreationTest.scala | 4 +-- .../QuantilesTest.scala | 4 +-- .../SpotLDACWrapperTest.scala} | 30 +++++++++---------- .../testutils/TestingSparkContext.scala | 2 +- .../TestingSparkContextFlatSpec.scala | 2 +- .../TestingSparkContextFunSuite.scala | 2 +- .../TestingSparkContextWordSpec.scala | 2 +- .../utilities/DomainProcessorTest.scala | 6 ++-- 45 files changed, 169 insertions(+), 168 deletions(-) delete mode 100644 .gitmodules rename src/main/scala/org/{opennetworkinsight/OniLDACWrapper.scala => spot/SpotLDACWrapper.scala} (87%) rename src/main/scala/org/{opennetworkinsight => spot}/SuspiciousConnects.scala (80%) rename src/main/scala/org/{opennetworkinsight => spot}/SuspiciousConnectsArgumentParser.scala (99%) rename src/main/scala/org/{opennetworkinsight => spot}/SuspiciousConnectsScoreFunction.scala (96%) rename src/main/scala/org/{opennetworkinsight => spot}/dns/DNSSchema.scala (94%) rename src/main/scala/org/{opennetworkinsight => spot}/dns/DNSSuspiciousConnectsAnalysis.scala (85%) rename src/main/scala/org/{opennetworkinsight => spot}/dns/DNSWordCreation.scala (94%) rename src/main/scala/org/{opennetworkinsight => spot}/dns/model/DNSFeedback.scala (95%) rename src/main/scala/org/{opennetworkinsight => spot}/dns/model/DNSScoreFunction.scala (91%) rename src/main/scala/org/{opennetworkinsight => spot}/dns/model/DNSSuspiciousConnectsModel.scala (92%) rename src/main/scala/org/{opennetworkinsight => spot}/dns/sideinformation/DNSSideInformation.scala (82%) rename src/main/scala/org/{opennetworkinsight => spot}/dns/sideinformation/DNSSideInformationFunction.scala (92%) rename src/main/scala/org/{opennetworkinsight => spot}/netflow/FlowColumnIndex.scala (91%) rename src/main/scala/org/{opennetworkinsight => spot}/netflow/FlowPostLDA.scala (98%) 
rename src/main/scala/org/{opennetworkinsight => spot}/netflow/FlowPreLDA.scala (94%) rename src/main/scala/org/{opennetworkinsight => spot}/netflow/FlowSchema.scala (96%) rename src/main/scala/org/{opennetworkinsight => spot}/netflow/FlowSuspiciousConnects.scala (70%) rename src/main/scala/org/{opennetworkinsight => spot}/netflow/FlowWordCreation.scala (97%) rename src/main/scala/org/{opennetworkinsight => spot}/proxy/ProxyFeedback.scala (96%) rename src/main/scala/org/{opennetworkinsight => spot}/proxy/ProxySchema.scala (96%) rename src/main/scala/org/{opennetworkinsight => spot}/proxy/ProxySuspiciousConnectsAnalysis.scala (89%) rename src/main/scala/org/{opennetworkinsight => spot}/proxy/ProxySuspiciousConnectsModel.scala (89%) rename src/main/scala/org/{opennetworkinsight => spot}/proxy/ProxyWordCreation.scala (93%) rename src/main/scala/org/{opennetworkinsight => spot}/utilities/CountryCodes.scala (97%) rename src/main/scala/org/{opennetworkinsight => spot}/utilities/DataFrameUtils.scala (96%) rename src/main/scala/org/{opennetworkinsight => spot}/utilities/DomainProcessor.scala (99%) rename src/main/scala/org/{opennetworkinsight => spot}/utilities/Entropy.scala (91%) rename src/main/scala/org/{opennetworkinsight => spot}/utilities/Quantiles.scala (98%) rename src/main/scala/org/{opennetworkinsight => spot}/utilities/TimeUtilities.scala (87%) rename src/main/scala/org/{opennetworkinsight => spot}/utilities/TopDomains.scala (86%) rename src/test/scala/org/{opennetworkinsight => spot}/DNSWordCreationTest.scala (56%) rename src/test/scala/org/{opennetworkinsight => spot}/FlowWordCreationTest.scala (99%) rename src/test/scala/org/{opennetworkinsight => spot}/QuantilesTest.scala (98%) rename src/test/scala/org/{opennetworkinsight/OniLDACWrapperTest.scala => spot/SpotLDACWrapperTest.scala} (87%) rename src/test/scala/org/{opennetworkinsight => spot}/testutils/TestingSparkContext.scala (98%) rename src/test/scala/org/{opennetworkinsight => spot}/testutils/TestingSparkContextFlatSpec.scala (94%) rename src/test/scala/org/{opennetworkinsight => spot}/testutils/TestingSparkContextFunSuite.scala (94%) rename src/test/scala/org/{opennetworkinsight => spot}/testutils/TestingSparkContextWordSpec.scala (94%) rename src/test/scala/org/{opennetworkinsight => spot}/utilities/DomainProcessorTest.scala (96%) diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index d9cfe6a..0000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "oni-lda-c"] - path = oni-lda-c - url = https://github.com/OpenNetworkInsight/oni-lda-c diff --git a/INSTALL.md b/INSTALL.md index e03b6a8..0b7d5e5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,17 +1,17 @@ -# oni-ml +# spot-ml Machine learning routines for OpenNetworkInsight, version 1.1 ## Prerequisites, Installation and Configuration -Install and configure oni-ml as a part of the Open-Network-Insight project, per the instruction at -[the Open-Network-Insight wiki](https://github.com/Open-Network-Insight/open-network-insight/wiki). +Install and configure spot-ml as a part of the Spot project, per the instruction at +[the Spot wiki]. -The oni-ml routines must be built into a jar stored at `target/scala-2.10/oni-ml-assembly-1.1.jar` on the master node. This requires Scala 2.10 or later to be installed on the system building the jar. To build the jar, from the top-level of the oni-ml repo, execute the command `sbt assembly`. +The spot-ml routines must be built into a jar stored at `target/scala-2.10/spot-ml-assembly-1.1.jar` on the master node. 
This requires Scala 2.10 or later to be installed on the system building the jar. To build the jar, from the top-level of the spot-ml repo, execute the command `sbt assembly`. -Names and language that we will use from the configuration variables for Open-Network-Insight (that are set in the file [duxbay.conf](https://github.com/Open-Network-Insight/oni-setup/blob/dev/duxbay.conf)) +Names and language that we will use from the configuration variables for Spot (that are set in the file [spot.conf]) -- MLNODE The node from which the oni-ml routines are invoked +- MLNODE The node from which the spot-ml routines are invoked - NODES The list of MPI worker nodes that execute the topic modelling analysis - HUSER An HDFS user path that will be the base path for the solution; this is usually the same user that you created to run the solution - LPATH The local path for the ML intermediate and final results, dynamically created and populated when the pipeline runs @@ -22,7 +22,7 @@ Names and language that we will use from the configuration variables for Open-Ne ### Prepare data for input -Load data for consumption by oni-ml by running [oni-ingest](https://github.com/Open-Network-Insight/oni-ingest/tree/dev). +Load data for consumption by spot-ml by running [spot-ingest]. ## Run a suspicious connects analysis @@ -52,7 +52,7 @@ As the maximum probability of an event is 1, a threshold of 1 can be used to sel ## Licensing -oni-ml is licensed under Apache Version 2.0 +spot-ml is licensed under Apache Version 2.0 ## Contributing @@ -60,7 +60,7 @@ Create a pull request and contact the maintainers. ## Issues -Report issues at the [OpenNetworkInsight issues page](https://github.com/Open-Network-Insight/open-network-insight/issues). +Report issues at the [Spot issues page]. ## Maintainers diff --git a/README.md b/README.md index 0c73b8b..f3b365f 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,19 @@ -# oni-ml +# spot-ml Machine learning routines for OpenNetworkInsight, version 1.1 -At present, oni-ml contains routines for performing *suspicious connections* analyses on netflow, DNS or proxy data gathered from a network. These +At present, spot-ml contains routines for performing *suspicious connections* analyses on netflow, DNS or proxy data gathered from a network. These analyses consume a (possibly very lage) collection of network events and produces a list of the events that considered to be the least probable (or most suspicious). -oni-ml is designed to be run as a component of Open-Network-Insight. It relies on the ingest component of Open-Network-Insight to collect and load -netflow and DNS records, and oni-ml will try to load data to the operational analytics component of Open-Network-Insight. It is strongly suggested that when experimenting with oni-ml, you do so as a part of the unified Open-Network-Insight system: Please see [the Open-Network-Insight wiki](https://github.com/Open-Network-Insight/open-network-insight/wiki) +spot-ml is designed to be run as a component of Spot. It relies on the ingest component of Spot to collect and load +netflow and DNS records, and spot-ml will try to load data to the operational analytics component of Spot. 
It is suggested that when experimenting with spot-ml, you do so as a part of the unified Spot system: Please see [the Spot wiki] -The remaining instructions in this README file treat oni-ml in a stand-alone fashion that might be helpful for customizing and troubleshooting the +The remaining instructions in this README file treat spot-ml in a stand-alone fashion that might be helpful for customizing and troubleshooting the component. ## Prepare data for input -Load data for consumption by oni-ml by running [oni-ingest](https://github.com/Open-Network-Insight/oni-ingest/tree/dev). +Load data for consumption by spot-ml by running [spot-ingest]. The data format and location where the data is stored differs for netflow and DNS analyses. @@ -21,7 +21,7 @@ The data format and location where the data is stored differs for netflow and DN Netflow data for the year YEAR, month MONTH, and day DAY is stored in HDFS at `HUSER/flow/csv/y=YEAR/m=MONTH/d=DAY/*` -Data for oni-ml netflow analyses is currently stored in text csv files using the following schema: +Data for spot-ml netflow analyses is currently stored in text csv files using the following schema: - time: String - year: Double @@ -55,7 +55,7 @@ Data for oni-ml netflow analyses is currently stored in text csv files using the DNS data for the year YEAR, month MONTH and day DAY is stored in Hive at `HUSER/dns/hive/y=YEAR/m=MONTH/d=DAY/` -The Hive tables containing DNS data for oni-ml analyses have the following schema: +The Hive tables containing DNS data for spot-ml analyses have the following schema: - frame_time: STRING - unix_tstamp: BIGINT @@ -124,12 +124,12 @@ As the maximum probability of an event is 1, a threshold of 1 can be used to sel ``` -### oni-ml output +### spot-ml output Final results are stored in the following file on HDFS. Depending on which data source is analyzed, -oni-ml output will be found under the ``HPATH`` at one of +spot-ml output will be found under the ``HPATH`` at one of $HPATH/dns/scored_results/YYYYMMDD/scores/dns_results.csv $HPATH/proxy/scored_results/YYYYMMDD/scores/results.csv @@ -138,7 +138,7 @@ oni-ml output will be found under the ``HPATH`` at one of It is a csv file in which network events annotated with estimated probabilities and sorted in ascending order. -A successful run of oni-ml will also create and populate a directory at `LPATH//YYYYMMDD` where `` is one of flow, dns or proxy, and +A successful run of spot-ml will also create and populate a directory at `LPATH//YYYYMMDD` where `` is one of flow, dns or proxy, and `YYYYMMDD` is the date argument provided to `ml_ops.sh` This directory will contain the following files generated during the LDA procedure used for topic-modelling: @@ -152,7 +152,7 @@ In addition, on each worker node identified in NODES, in the `LPATH//YYY ## Licensing -oni-ml is licensed under Apache Version 2.0 +spot-ml is licensed under Apache Version 2.0 ## Contributing @@ -160,7 +160,7 @@ Create a pull request and contact the maintainers. ## Issues -Report issues at the [OpenNetworkInsight issues page](https://github.com/Open-Network-Insight/open-network-insight/issues). +Report issues at the [Spot issues page]. 
## Maintainers diff --git a/build.sbt b/build.sbt index 3248490..4ed5884 100644 --- a/build.sbt +++ b/build.sbt @@ -1,4 +1,4 @@ -name := "oni-ml" +name := "spot-ml" version := "1.1" diff --git a/install_ml.sh b/install_ml.sh index 3e29545..ef92345 100755 --- a/install_ml.sh +++ b/install_ml.sh @@ -1,13 +1,12 @@ #!/bin/bash - # lda stage -source /etc/duxbay.conf +source /etc/spot.conf # copy solution files to all nodes for d in "${NODES[@]}" do - rsync -v -a --include='target' --include='target/scala-2.10' --include='target/scala-2.10/oni-ml-assembly-1.1.jar' \ - --include='oni-lda-c' --include='oni-lda-c/*' --include 'top-1m.csv' --include='*.sh' \ + rsync -v -a --include='target' --include='target/scala-2.10' --include='target/scala-2.10/spot-ml-assembly-1.1.jar' \ + --include='spot-lda-c' --include='spot-lda-c/*' --include 'top-1m.csv' --include='*.sh' \ --exclude='*' . $d:${LUSER}/ml done diff --git a/ml_ops.sh b/ml_ops.sh index 4135a1e..d60cd2c 100755 --- a/ml_ops.sh +++ b/ml_ops.sh @@ -23,7 +23,7 @@ fi # read in variables (except for date) from etc/.conf file # note: FDATE and DSOURCE *must* be defined prior sourcing this conf file -source /etc/duxbay.conf +source /etc/spot.conf # third argument if present will override default TOL from conf file @@ -78,7 +78,7 @@ rm -f ${LPATH}/*.{dat,beta,gamma,other,pkl} # protect the flow_scores.csv file hdfs dfs -rm -R -f ${HDFS_SCORED_CONNECTS} # Add -p to execute pre MPI command. -# Pre MPI command can be configured in /etc/duxbay.conf +# Pre MPI command can be configured in /etc/spot.conf # In this script, after the line after --mpicmd ${MPI_CMD} add: # --mpiprep ${MPI_PREP_CMD} @@ -101,7 +101,7 @@ time spark-submit --class "org.opennetworkinsight.SuspiciousConnects" \ --conf spark.shuffle.service.enabled=true \ --conf spark.yarn.am.waitTime=1000000 \ --conf spark.yarn.driver.memoryOverhead=${SPK_DRIVER_MEM_OVERHEAD} \ - --conf spark.yarn.executor.memoryOverhead=${SPAK_EXEC_MEM_OVERHEAD} target/scala-2.10/oni-ml-assembly-1.1.jar \ + --conf spark.yarn.executor.memoryOverhead=${SPAK_EXEC_MEM_OVERHEAD} target/scala-2.10/spot-ml-assembly-1.1.jar \ --analysis ${DSOURCE} \ --input ${RAWDATA_PATH} \ --dupfactor ${DUPFACTOR} \ diff --git a/src/main/scala/org/opennetworkinsight/OniLDACWrapper.scala b/src/main/scala/org/spot/SpotLDACWrapper.scala similarity index 87% rename from src/main/scala/org/opennetworkinsight/OniLDACWrapper.scala rename to src/main/scala/org/spot/SpotLDACWrapper.scala index 1a7664e..8a7f405 100644 --- a/src/main/scala/org/opennetworkinsight/OniLDACWrapper.scala +++ b/src/main/scala/org/spot/SpotLDACWrapper.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight +package org.spot import org.apache.spark.rdd.RDD import java.io.PrintWriter @@ -15,14 +15,14 @@ import scala.sys.process._ * 4. 
Calculates and returns probability of word given topic: p(w|z) */ -object OniLDACWrapper { +object SpotLDACWrapper { - case class OniLDACInput(doc: String, word: String, count: Int) extends Serializable + case class SpotLDACInput(doc: String, word: String, count: Int) extends Serializable - case class OniLDACOutput(docToTopicMix: Map[String, Array[Double]], wordResults: Map[String, Array[Double]]) + case class SpotLDACOutput(docToTopicMix: Map[String, Array[Double]], wordResults: Map[String, Array[Double]]) - def runLDA(docWordCount: RDD[OniLDACInput], + def runLDA(docWordCount: RDD[SpotLDACInput], modelFile: String, topicDocumentFile: String, topicWordFile: String, @@ -35,19 +35,19 @@ object OniLDACWrapper { localUser: String, dataSource: String, nodes: String, - prgSeed: Option[Long]): OniLDACOutput = { + prgSeed: Option[Long]): SpotLDACOutput = { // Create word Map Word,Index for further usage val wordDictionary: Map[String, Int] = { val words = docWordCount .cache - .map({case OniLDACInput(doc, word, count) => word}) + .map({case SpotLDACInput(doc, word, count) => word}) .distinct .collect words.zipWithIndex.toMap } - val distinctDocument = docWordCount.map({case OniLDACInput(doc, word, count) => doc}).distinct.collect + val distinctDocument = docWordCount.map({case SpotLDACInput(doc, word, count) => doc}).distinct.collect //distinctDocument.cache() // Create document Map Index, Document for further usage @@ -113,7 +113,7 @@ object OniLDACWrapper { // Create word results val wordResults = getWordToProbPerTopicMap(topicWordData, wordDictionary) - OniLDACOutput(docToTopicMix, wordResults) + SpotLDACOutput(docToTopicMix, wordResults) } /** @@ -147,18 +147,18 @@ object OniLDACWrapper { } } - def createModel(documentWordData: RDD[OniLDACInput], wordToIndex: Map[String, Int], distinctDocument: Array[String]) + def createModel(documentWordData: RDD[SpotLDACInput], wordToIndex: Map[String, Int], distinctDocument: Array[String]) : Array[String] = { val documentCount = documentWordData - .map({case OniLDACInput(doc, word, count) => doc}) + .map({case SpotLDACInput(doc, word, count) => doc}) .map(document => (document, 1)) .reduceByKey(_ + _) .collect .toMap val wordIndexdocWordCount = documentWordData - .map({case OniLDACInput(doc, word, count) => (doc, wordToIndex(word) + ":" + count)}) + .map({case SpotLDACInput(doc, word, count) => (doc, wordToIndex(word) + ":" + count)}) .groupByKey() .map(x => (x._1, x._2.mkString(" "))) .collect diff --git a/src/main/scala/org/opennetworkinsight/SuspiciousConnects.scala b/src/main/scala/org/spot/SuspiciousConnects.scala similarity index 80% rename from src/main/scala/org/opennetworkinsight/SuspiciousConnects.scala rename to src/main/scala/org/spot/SuspiciousConnects.scala index 9f8e37f..d948f62 100644 --- a/src/main/scala/org/opennetworkinsight/SuspiciousConnects.scala +++ b/src/main/scala/org/spot/SuspiciousConnects.scala @@ -1,12 +1,12 @@ -package org.opennetworkinsight +package org.spot import org.apache.log4j.{Level, LogManager, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} -import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.opennetworkinsight.dns.DNSSuspiciousConnectsAnalysis -import org.opennetworkinsight.netflow.FlowSuspiciousConnects -import org.opennetworkinsight.proxy.ProxySuspiciousConnectsAnalysis +import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.spot.dns.DNSSuspiciousConnectsAnalysis +import 
org.spot.netflow.FlowSuspiciousConnects +import org.spot.proxy.ProxySuspiciousConnectsAnalysis /** * Top level entrypoint to execute suspicious connections analysis on network data. @@ -39,7 +39,7 @@ object SuspiciousConnects { Logger.getLogger("akka").setLevel(Level.OFF) val analysis = config.analysis - val sparkConfig = new SparkConf().setAppName("ONI ML: " + analysis + " lda") + val sparkConfig = new SparkConf().setAppName("Spot ML: " + analysis + " suspicious connects analysis") val sparkContext = new SparkContext(sparkConfig) val sqlContext = new SQLContext(sparkContext) implicit val outputDelimiter = OutputDelimiter diff --git a/src/main/scala/org/opennetworkinsight/SuspiciousConnectsArgumentParser.scala b/src/main/scala/org/spot/SuspiciousConnectsArgumentParser.scala similarity index 99% rename from src/main/scala/org/opennetworkinsight/SuspiciousConnectsArgumentParser.scala rename to src/main/scala/org/spot/SuspiciousConnectsArgumentParser.scala index 793d18a..f8ffeaa 100644 --- a/src/main/scala/org/opennetworkinsight/SuspiciousConnectsArgumentParser.scala +++ b/src/main/scala/org/spot/SuspiciousConnectsArgumentParser.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight +package org.spot /** diff --git a/src/main/scala/org/opennetworkinsight/SuspiciousConnectsScoreFunction.scala b/src/main/scala/org/spot/SuspiciousConnectsScoreFunction.scala similarity index 96% rename from src/main/scala/org/opennetworkinsight/SuspiciousConnectsScoreFunction.scala rename to src/main/scala/org/spot/SuspiciousConnectsScoreFunction.scala index a358708..549fee6 100644 --- a/src/main/scala/org/opennetworkinsight/SuspiciousConnectsScoreFunction.scala +++ b/src/main/scala/org/spot/SuspiciousConnectsScoreFunction.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight +package org.spot import org.apache.spark.broadcast.Broadcast diff --git a/src/main/scala/org/opennetworkinsight/dns/DNSSchema.scala b/src/main/scala/org/spot/dns/DNSSchema.scala similarity index 94% rename from src/main/scala/org/opennetworkinsight/dns/DNSSchema.scala rename to src/main/scala/org/spot/dns/DNSSchema.scala index 1435468..97b9431 100644 --- a/src/main/scala/org/opennetworkinsight/dns/DNSSchema.scala +++ b/src/main/scala/org/spot/dns/DNSSchema.scala @@ -1,8 +1,8 @@ -package org.opennetworkinsight.dns +package org.spot.dns import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -import org.opennetworkinsight.dns.model.DNSSuspiciousConnectsModel.ModelSchema +import org.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema /** * Data frame schemas and column names used in the DNS suspicious connects analysis. 
diff --git a/src/main/scala/org/opennetworkinsight/dns/DNSSuspiciousConnectsAnalysis.scala b/src/main/scala/org/spot/dns/DNSSuspiciousConnectsAnalysis.scala similarity index 85% rename from src/main/scala/org/opennetworkinsight/dns/DNSSuspiciousConnectsAnalysis.scala rename to src/main/scala/org/spot/dns/DNSSuspiciousConnectsAnalysis.scala index 0c096cf..e08c5c7 100644 --- a/src/main/scala/org/opennetworkinsight/dns/DNSSuspiciousConnectsAnalysis.scala +++ b/src/main/scala/org/spot/dns/DNSSuspiciousConnectsAnalysis.scala @@ -1,17 +1,17 @@ -package org.opennetworkinsight.dns +package org.spot.dns import org.apache.spark.SparkContext import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, SQLContext} -import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.opennetworkinsight.dns.DNSSchema._ -import org.opennetworkinsight.dns.model.DNSSuspiciousConnectsModel -import org.opennetworkinsight.dns.sideinformation.DNSSideInformation +import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.spot.dns.DNSSchema._ +import org.spot.dns.model.DNSSuspiciousConnectsModel +import org.spot.dns.sideinformation.DNSSideInformation import org.apache.log4j.Logger -import org.opennetworkinsight.dns.model.DNSSuspiciousConnectsModel.ModelSchema -import org.opennetworkinsight.dns.sideinformation.DNSSideInformation.SideInfoSchema +import org.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema +import org.spot.dns.sideinformation.DNSSideInformation.SideInfoSchema /** * The suspicious connections analysis of DNS log data develops a probabilistic model the DNS queries diff --git a/src/main/scala/org/opennetworkinsight/dns/DNSWordCreation.scala b/src/main/scala/org/spot/dns/DNSWordCreation.scala similarity index 94% rename from src/main/scala/org/opennetworkinsight/dns/DNSWordCreation.scala rename to src/main/scala/org/spot/dns/DNSWordCreation.scala index afda928..544b31f 100644 --- a/src/main/scala/org/opennetworkinsight/dns/DNSWordCreation.scala +++ b/src/main/scala/org/spot/dns/DNSWordCreation.scala @@ -1,9 +1,9 @@ -package org.opennetworkinsight.dns +package org.spot.dns import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions._ -import org.opennetworkinsight.utilities.DomainProcessor.{DomainInfo, extractDomainInfo} -import org.opennetworkinsight.utilities.Quantiles +import org.spot.utilities.DomainProcessor.{DomainInfo, extractDomainInfo} +import org.spot.utilities.Quantiles /** diff --git a/src/main/scala/org/opennetworkinsight/dns/model/DNSFeedback.scala b/src/main/scala/org/spot/dns/model/DNSFeedback.scala similarity index 95% rename from src/main/scala/org/opennetworkinsight/dns/model/DNSFeedback.scala rename to src/main/scala/org/spot/dns/model/DNSFeedback.scala index d27cd0c..9048e97 100644 --- a/src/main/scala/org/opennetworkinsight/dns/model/DNSFeedback.scala +++ b/src/main/scala/org/spot/dns/model/DNSFeedback.scala @@ -1,9 +1,9 @@ -package org.opennetworkinsight.dns.model +package org.spot.dns.model import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.opennetworkinsight.dns.model.DNSSuspiciousConnectsModel.{ModelSchema, modelColumns} +import org.spot.dns.model.DNSSuspiciousConnectsModel.{ModelSchema, modelColumns} import scala.io.Source diff --git a/src/main/scala/org/opennetworkinsight/dns/model/DNSScoreFunction.scala 
b/src/main/scala/org/spot/dns/model/DNSScoreFunction.scala similarity index 91% rename from src/main/scala/org/opennetworkinsight/dns/model/DNSScoreFunction.scala rename to src/main/scala/org/spot/dns/model/DNSScoreFunction.scala index be97da8..3129e8a 100644 --- a/src/main/scala/org/opennetworkinsight/dns/model/DNSScoreFunction.scala +++ b/src/main/scala/org/spot/dns/model/DNSScoreFunction.scala @@ -1,8 +1,8 @@ -package org.opennetworkinsight.dns.model +package org.spot.dns.model import org.apache.spark.broadcast.Broadcast -import org.opennetworkinsight.SuspiciousConnectsScoreFunction -import org.opennetworkinsight.dns.DNSWordCreation +import org.spot.SuspiciousConnectsScoreFunction +import org.spot.dns.DNSWordCreation /** diff --git a/src/main/scala/org/opennetworkinsight/dns/model/DNSSuspiciousConnectsModel.scala b/src/main/scala/org/spot/dns/model/DNSSuspiciousConnectsModel.scala similarity index 92% rename from src/main/scala/org/opennetworkinsight/dns/model/DNSSuspiciousConnectsModel.scala rename to src/main/scala/org/spot/dns/model/DNSSuspiciousConnectsModel.scala index 3d0f301..d8c289b 100644 --- a/src/main/scala/org/opennetworkinsight/dns/model/DNSSuspiciousConnectsModel.scala +++ b/src/main/scala/org/spot/dns/model/DNSSuspiciousConnectsModel.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.dns.model +package org.spot.dns.model import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast @@ -6,13 +6,13 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.opennetworkinsight.OniLDACWrapper -import org.opennetworkinsight.OniLDACWrapper.{OniLDACInput, OniLDACOutput} -import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.opennetworkinsight.dns.DNSSchema._ -import org.opennetworkinsight.dns.DNSWordCreation -import org.opennetworkinsight.utilities.{CountryCodes, DomainProcessor, Quantiles, TopDomains} -import org.opennetworkinsight.utilities.DomainProcessor.DomainInfo +import org.spot.SpotLDACWrapper +import org.spot.SpotLDACWrapper.{SpotLDACInput, SpotLDACOutput} +import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.spot.dns.DNSSchema._ +import org.spot.dns.DNSWordCreation +import org.spot.utilities.{CountryCodes, DomainProcessor, Quantiles, TopDomains} +import org.spot.utilities.DomainProcessor.DomainInfo import org.apache.log4j.Logger @@ -64,7 +64,7 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int, * @param sc Spark Context * @param sqlContext Spark SQL context * @param inDF Dataframe of DNS log events, containing at least the columns of [[DNSSuspiciousConnectsModel.ModelSchema]] - * @return Dataframe with a column named [[org.opennetworkinsight.dns.DNSSchema.Score]] that contains the + * @return Dataframe with a column named [[org.spot.dns.DNSSchema.Score]] that contains the * probability estimated for the network event at that row */ def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame): DataFrame = { @@ -190,10 +190,10 @@ object DNSSuspiciousConnectsModel { val ipDstWordCounts = dataWithWordDF.select(ClientIP, Word).map({ case Row(destIP: String, word: String) => (destIP, word) -> 1 }) .reduceByKey(_ + _) - .map({ case ((ipDst, word), count) => OniLDACInput(ipDst, word, count) }) + .map({ case ((ipDst, word), count) => SpotLDACInput(ipDst, word, count) }) - val OniLDACOutput(ipToTopicMix, wordToPerTopicProb) = 
OniLDACWrapper.runLDA(ipDstWordCounts, + val SpotLDACOutput(ipToTopicMix, wordToPerTopicProb) = SpotLDACWrapper.runLDA(ipDstWordCounts, config.modelFile, config.topicDocumentFile, config.topicWordFile, diff --git a/src/main/scala/org/opennetworkinsight/dns/sideinformation/DNSSideInformation.scala b/src/main/scala/org/spot/dns/sideinformation/DNSSideInformation.scala similarity index 82% rename from src/main/scala/org/opennetworkinsight/dns/sideinformation/DNSSideInformation.scala rename to src/main/scala/org/spot/dns/sideinformation/DNSSideInformation.scala index 88235d6..1d4f49e 100644 --- a/src/main/scala/org/opennetworkinsight/dns/sideinformation/DNSSideInformation.scala +++ b/src/main/scala/org/spot/dns/sideinformation/DNSSideInformation.scala @@ -1,18 +1,19 @@ -package org.opennetworkinsight.dns.sideinformation +package org.spot.dns.sideinformation import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.opennetworkinsight.dns.DNSSchema._ -import org.opennetworkinsight.dns.model.DNSSuspiciousConnectsModel -import org.opennetworkinsight.utilities.{CountryCodes, TopDomains} +import org.spot.dns.DNSSchema._ +import org.spot.dns.model.DNSSuspiciousConnectsModel +import org.spot.utilities.{CountryCodes, TopDomains} /** * Create the side information required by the OA layer. - * @param model An instance of [[org.opennetworkinsight.dns.model.DNSSuspiciousConnectsModel]] + * + * @param model An instance of [[org.spot.dns.model.DNSSuspiciousConnectsModel]] * Necessary because much of the side information is auxilliary information used to * construct the model. */ @@ -21,10 +22,11 @@ class DNSSideInformation(model: DNSSuspiciousConnectsModel) { /** * Add side information to a dataframe. + * * @param sparkContext Spark context. - * @param sqlContext Spark SQL context. - * @param inDF dataframe containing at least the rows of - * [[org.opennetworkinsight.dns.model.DNSSuspiciousConnectsModel.ModelSchema]] + * @param sqlContext Spark SQL context. + * @param inDF dataframe containing at least the rows of + * [[org.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema]] * @return copy of the dataframe with the columsns [[DNSSideInformation.SideInfoSchema]] added */ def addSideInformationForOA(sparkContext: SparkContext, @@ -73,6 +75,7 @@ object DNSSideInformation { /** * The per DNS log entry values in the side information. 
+ * * @param domain * @param topDomain * @param subdomain diff --git a/src/main/scala/org/opennetworkinsight/dns/sideinformation/DNSSideInformationFunction.scala b/src/main/scala/org/spot/dns/sideinformation/DNSSideInformationFunction.scala similarity index 92% rename from src/main/scala/org/opennetworkinsight/dns/sideinformation/DNSSideInformationFunction.scala rename to src/main/scala/org/spot/dns/sideinformation/DNSSideInformationFunction.scala index 38e41f0..44d58ff 100644 --- a/src/main/scala/org/opennetworkinsight/dns/sideinformation/DNSSideInformationFunction.scala +++ b/src/main/scala/org/spot/dns/sideinformation/DNSSideInformationFunction.scala @@ -1,12 +1,13 @@ -package org.opennetworkinsight.dns.sideinformation +package org.spot.dns.sideinformation import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.Row -import org.opennetworkinsight.dns.DNSWordCreation -import org.opennetworkinsight.utilities.DomainProcessor.{DomainInfo, extractDomainInfo} +import org.spot.dns.DNSWordCreation +import org.spot.utilities.DomainProcessor.{DomainInfo, extractDomainInfo} /** * Add side information for OA to a dataframe. + * * @param fieldNames * @param frameLengthCuts * @param timeCuts diff --git a/src/main/scala/org/opennetworkinsight/netflow/FlowColumnIndex.scala b/src/main/scala/org/spot/netflow/FlowColumnIndex.scala similarity index 91% rename from src/main/scala/org/opennetworkinsight/netflow/FlowColumnIndex.scala rename to src/main/scala/org/spot/netflow/FlowColumnIndex.scala index 3d17cb2..a3aede2 100644 --- a/src/main/scala/org/opennetworkinsight/netflow/FlowColumnIndex.scala +++ b/src/main/scala/org/spot/netflow/FlowColumnIndex.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.netflow +package org.spot.netflow object FlowColumnIndex extends Enumeration { val HOUR = 4 diff --git a/src/main/scala/org/opennetworkinsight/netflow/FlowPostLDA.scala b/src/main/scala/org/spot/netflow/FlowPostLDA.scala similarity index 98% rename from src/main/scala/org/opennetworkinsight/netflow/FlowPostLDA.scala rename to src/main/scala/org/spot/netflow/FlowPostLDA.scala index ebfff6c..06b5fc1 100644 --- a/src/main/scala/org/opennetworkinsight/netflow/FlowPostLDA.scala +++ b/src/main/scala/org/spot/netflow/FlowPostLDA.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.netflow +package org.spot.netflow import org.apache.log4j.Logger import org.apache.spark.SparkContext @@ -6,7 +6,7 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.opennetworkinsight.netflow.FlowSchema._ +import org.spot.netflow.FlowSchema._ /** * Contains routines for scoring incoming netflow records from a netflow suspicious connections model. 
diff --git a/src/main/scala/org/opennetworkinsight/netflow/FlowPreLDA.scala b/src/main/scala/org/spot/netflow/FlowPreLDA.scala similarity index 94% rename from src/main/scala/org/opennetworkinsight/netflow/FlowPreLDA.scala rename to src/main/scala/org/spot/netflow/FlowPreLDA.scala index 067aa9c..ffdc907 100755 --- a/src/main/scala/org/opennetworkinsight/netflow/FlowPreLDA.scala +++ b/src/main/scala/org/spot/netflow/FlowPreLDA.scala @@ -1,12 +1,12 @@ -package org.opennetworkinsight.netflow +package org.spot.netflow import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.opennetworkinsight.OniLDACWrapper.OniLDACInput -import org.opennetworkinsight.netflow.FlowSchema._ +import org.spot.SpotLDACWrapper.SpotLDACInput +import org.spot.netflow.FlowSchema._ import scala.io.Source @@ -17,7 +17,7 @@ import scala.io.Source object FlowPreLDA { def flowPreLDA(inputPath: String, scoresFile: String, duplicationFactor: Int, - sc: SparkContext, sqlContext: SQLContext, logger: Logger ): RDD[OniLDACInput] = { + sc: SparkContext, sqlContext: SQLContext, logger: Logger ): RDD[SpotLDACInput] = { logger.info("Flow pre LDA starts") @@ -136,7 +136,7 @@ object FlowPreLDA { .map({ case Row(destinationIp: String, destinationWord: String) => (destinationIp, destinationWord) -> 1 }) .reduceByKey(_ + _) - val word_counts = sc.union(src_word_counts, dest_word_counts).map({case ((ip, word), count) => OniLDACInput(ip, word, count)}) + val word_counts = sc.union(src_word_counts, dest_word_counts).map({case ((ip, word), count) => SpotLDACInput(ip, word, count)}) logger.info("Flow pre LDA completed") word_counts diff --git a/src/main/scala/org/opennetworkinsight/netflow/FlowSchema.scala b/src/main/scala/org/spot/netflow/FlowSchema.scala similarity index 96% rename from src/main/scala/org/opennetworkinsight/netflow/FlowSchema.scala rename to src/main/scala/org/spot/netflow/FlowSchema.scala index 06de55c..67be4a9 100644 --- a/src/main/scala/org/opennetworkinsight/netflow/FlowSchema.scala +++ b/src/main/scala/org/spot/netflow/FlowSchema.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.netflow +package org.spot.netflow object FlowSchema { diff --git a/src/main/scala/org/opennetworkinsight/netflow/FlowSuspiciousConnects.scala b/src/main/scala/org/spot/netflow/FlowSuspiciousConnects.scala similarity index 70% rename from src/main/scala/org/opennetworkinsight/netflow/FlowSuspiciousConnects.scala rename to src/main/scala/org/spot/netflow/FlowSuspiciousConnects.scala index 9bc58c9..132768f 100644 --- a/src/main/scala/org/opennetworkinsight/netflow/FlowSuspiciousConnects.scala +++ b/src/main/scala/org/spot/netflow/FlowSuspiciousConnects.scala @@ -1,11 +1,11 @@ -package org.opennetworkinsight.netflow +package org.spot.netflow import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext -import org.opennetworkinsight.OniLDACWrapper -import org.opennetworkinsight.OniLDACWrapper.OniLDACOutput -import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.spot.SpotLDACWrapper +import org.spot.SpotLDACWrapper.SpotLDACOutput +import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig object FlowSuspiciousConnects { @@ -16,7 +16,7 @@ object FlowSuspiciousConnects { val docWordCount = FlowPreLDA.flowPreLDA(config.inputPath, config.scoresFile, config.duplicationFactor, sparkContext, sqlContext, logger) - val 
OniLDACOutput(documentResults, wordResults) = OniLDACWrapper.runLDA(docWordCount, config.modelFile, config.topicDocumentFile, config.topicWordFile, + val SpotLDACOutput(documentResults, wordResults) = SpotLDACWrapper.runLDA(docWordCount, config.modelFile, config.topicDocumentFile, config.topicWordFile, config.mpiPreparationCmd, config.mpiCmd, config.mpiProcessCount, config.topicCount, config.localPath, config.ldaPath, config.localUser, config.analysis, config.nodes, config.ldaPRGSeed) diff --git a/src/main/scala/org/opennetworkinsight/netflow/FlowWordCreation.scala b/src/main/scala/org/spot/netflow/FlowWordCreation.scala similarity index 97% rename from src/main/scala/org/opennetworkinsight/netflow/FlowWordCreation.scala rename to src/main/scala/org/spot/netflow/FlowWordCreation.scala index 85d655a..b37a817 100644 --- a/src/main/scala/org/opennetworkinsight/netflow/FlowWordCreation.scala +++ b/src/main/scala/org/spot/netflow/FlowWordCreation.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.netflow +package org.spot.netflow import org.apache.log4j.Logger import org.apache.spark.SparkContext @@ -6,8 +6,8 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.opennetworkinsight.netflow.{FlowSchema => Schema} -import org.opennetworkinsight.utilities.Quantiles +import org.spot.netflow.{FlowSchema => Schema} +import org.spot.utilities.Quantiles object FlowWordCreation { diff --git a/src/main/scala/org/opennetworkinsight/proxy/ProxyFeedback.scala b/src/main/scala/org/spot/proxy/ProxyFeedback.scala similarity index 96% rename from src/main/scala/org/opennetworkinsight/proxy/ProxyFeedback.scala rename to src/main/scala/org/spot/proxy/ProxyFeedback.scala index 2281701..0f2eede 100644 --- a/src/main/scala/org/opennetworkinsight/proxy/ProxyFeedback.scala +++ b/src/main/scala/org/spot/proxy/ProxyFeedback.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.proxy +package org.spot.proxy import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD @@ -6,13 +6,14 @@ import org.apache.spark.sql._ import org.apache.spark.sql.types.{StructType, StructField, StringType} import scala.io.Source -import org.opennetworkinsight.proxy.ProxySchema._ +import org.spot.proxy.ProxySchema._ object ProxyFeedback { /** * Load the feedback file for proxy data. + * * @param sc Spark context. * @param sqlContext Spark SQL context. * @param feedbackFile Local machine path to the proxy feedback file. diff --git a/src/main/scala/org/opennetworkinsight/proxy/ProxySchema.scala b/src/main/scala/org/spot/proxy/ProxySchema.scala similarity index 96% rename from src/main/scala/org/opennetworkinsight/proxy/ProxySchema.scala rename to src/main/scala/org/spot/proxy/ProxySchema.scala index 3d8c144..f7a6716 100644 --- a/src/main/scala/org/opennetworkinsight/proxy/ProxySchema.scala +++ b/src/main/scala/org/spot/proxy/ProxySchema.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.proxy +package org.spot.proxy /** * Data frame column names used in the proxy suspicious connects analysis. 
diff --git a/src/main/scala/org/opennetworkinsight/proxy/ProxySuspiciousConnectsAnalysis.scala b/src/main/scala/org/spot/proxy/ProxySuspiciousConnectsAnalysis.scala similarity index 89% rename from src/main/scala/org/opennetworkinsight/proxy/ProxySuspiciousConnectsAnalysis.scala rename to src/main/scala/org/spot/proxy/ProxySuspiciousConnectsAnalysis.scala index d9b7e3d..ebc0032 100644 --- a/src/main/scala/org/opennetworkinsight/proxy/ProxySuspiciousConnectsAnalysis.scala +++ b/src/main/scala/org/spot/proxy/ProxySuspiciousConnectsAnalysis.scala @@ -1,11 +1,11 @@ -package org.opennetworkinsight.proxy +package org.spot.proxy import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext -import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.opennetworkinsight.proxy.ProxySchema._ -import org.opennetworkinsight.utilities.DataFrameUtils +import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.spot.proxy.ProxySchema._ +import org.spot.utilities.DataFrameUtils /** * Run suspicious connections analysis on proxy data. diff --git a/src/main/scala/org/opennetworkinsight/proxy/ProxySuspiciousConnectsModel.scala b/src/main/scala/org/spot/proxy/ProxySuspiciousConnectsModel.scala similarity index 89% rename from src/main/scala/org/opennetworkinsight/proxy/ProxySuspiciousConnectsModel.scala rename to src/main/scala/org/spot/proxy/ProxySuspiciousConnectsModel.scala index a7816e4..5be09de 100644 --- a/src/main/scala/org/opennetworkinsight/proxy/ProxySuspiciousConnectsModel.scala +++ b/src/main/scala/org/spot/proxy/ProxySuspiciousConnectsModel.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.proxy +package org.spot.proxy import org.apache.log4j.Logger import org.apache.spark.SparkContext @@ -6,11 +6,11 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.opennetworkinsight.OniLDACWrapper.{OniLDACInput, OniLDACOutput} -import org.opennetworkinsight.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.opennetworkinsight.proxy.ProxySchema._ -import org.opennetworkinsight.utilities._ -import org.opennetworkinsight.{OniLDACWrapper, SuspiciousConnectsScoreFunction} +import org.spot.SpotLDACWrapper.{SpotLDACInput, SpotLDACOutput} +import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.spot.proxy.ProxySchema._ +import org.spot.utilities._ +import org.spot.{SpotLDACWrapper, SuspiciousConnectsScoreFunction} /** * Encapsulation of a proxy suspicious connections model. @@ -116,11 +116,11 @@ object ProxySuspiciousConnectsModel { val agentCuts = Quantiles.computeQuintiles(df.select(UserAgent).rdd.map({ case Row(agent: String) => agentToCountBC.value(agent) })) - val docWordCount: RDD[OniLDACInput] = + val docWordCount: RDD[SpotLDACInput] = getIPWordCounts(sparkContext, sqlContext, logger, df, config.scoresFile, config.duplicationFactor, agentToCount, timeCuts, entropyCuts, agentCuts) - val OniLDACOutput(documentResults, wordResults) = OniLDACWrapper.runLDA(docWordCount, + val SpotLDACOutput(documentResults, wordResults) = SpotLDACWrapper.runLDA(docWordCount, config.modelFile, config.topicDocumentFile, config.topicWordFile, @@ -140,9 +140,9 @@ object ProxySuspiciousConnectsModel { /** * Transform proxy log events into summarized words and aggregate into IP-word counts. - * Returned as OniLDACInput objects. 
+ * Returned as [[SpotLDACInput]] objects. * - * @return RDD of OniLDACInput objects containing the aggregated IP-word counts. + * @return RDD of [[SpotLDACInput]] objects containing the aggregated IP-word counts. */ def getIPWordCounts(sc: SparkContext, sqlContext: SQLContext, @@ -153,7 +153,7 @@ object ProxySuspiciousConnectsModel { agentToCount: Map[String, Long], timeCuts: Array[Double], entropyCuts: Array[Double], - agentCuts: Array[Double]): RDD[OniLDACInput] = { + agentCuts: Array[Double]): RDD[SpotLDACInput] = { logger.info("Read source data") @@ -170,7 +170,7 @@ object ProxySuspiciousConnectsModel { agentToCount: Map[String, Long], timeCuts: Array[Double], entropyCuts: Array[Double], - agentCuts: Array[Double]): RDD[OniLDACInput] = { + agentCuts: Array[Double]): RDD[SpotLDACInput] = { val topDomains: Broadcast[Set[String]] = sc.broadcast(TopDomains.TopDomains) @@ -188,6 +188,6 @@ object ProxySuspiciousConnectsModel { select(ClientIP, Word) ipWordDF.rdd.map({ case Row(ip, word) => ((ip.asInstanceOf[String], word.asInstanceOf[String]), 1) }) - .reduceByKey(_ + _).map({ case ((ip, word), count) => OniLDACInput(ip, word, count) }) + .reduceByKey(_ + _).map({ case ((ip, word), count) => SpotLDACInput(ip, word, count) }) } } \ No newline at end of file diff --git a/src/main/scala/org/opennetworkinsight/proxy/ProxyWordCreation.scala b/src/main/scala/org/spot/proxy/ProxyWordCreation.scala similarity index 93% rename from src/main/scala/org/opennetworkinsight/proxy/ProxyWordCreation.scala rename to src/main/scala/org/spot/proxy/ProxyWordCreation.scala index e651c17..086da8b 100644 --- a/src/main/scala/org/opennetworkinsight/proxy/ProxyWordCreation.scala +++ b/src/main/scala/org/spot/proxy/ProxyWordCreation.scala @@ -1,8 +1,8 @@ -package org.opennetworkinsight.proxy +package org.spot.proxy import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions._ -import org.opennetworkinsight.utilities.{Entropy, Quantiles, DomainProcessor, TimeUtilities} +import org.spot.utilities.{Entropy, Quantiles, DomainProcessor, TimeUtilities} object ProxyWordCreation { diff --git a/src/main/scala/org/opennetworkinsight/utilities/CountryCodes.scala b/src/main/scala/org/spot/utilities/CountryCodes.scala similarity index 97% rename from src/main/scala/org/opennetworkinsight/utilities/CountryCodes.scala rename to src/main/scala/org/spot/utilities/CountryCodes.scala index 57c01dd..349c880 100644 --- a/src/main/scala/org/opennetworkinsight/utilities/CountryCodes.scala +++ b/src/main/scala/org/spot/utilities/CountryCodes.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.utilities +package org.spot.utilities object CountryCodes { diff --git a/src/main/scala/org/opennetworkinsight/utilities/DataFrameUtils.scala b/src/main/scala/org/spot/utilities/DataFrameUtils.scala similarity index 96% rename from src/main/scala/org/opennetworkinsight/utilities/DataFrameUtils.scala rename to src/main/scala/org/spot/utilities/DataFrameUtils.scala index 8906760..463f3f0 100644 --- a/src/main/scala/org/opennetworkinsight/utilities/DataFrameUtils.scala +++ b/src/main/scala/org/spot/utilities/DataFrameUtils.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.utilities +package org.spot.utilities import org.apache.spark.sql.{DataFrame, Row} diff --git a/src/main/scala/org/opennetworkinsight/utilities/DomainProcessor.scala b/src/main/scala/org/spot/utilities/DomainProcessor.scala similarity index 99% rename from src/main/scala/org/opennetworkinsight/utilities/DomainProcessor.scala rename to 
src/main/scala/org/spot/utilities/DomainProcessor.scala index df13e03..95c0e23 100644 --- a/src/main/scala/org/opennetworkinsight/utilities/DomainProcessor.scala +++ b/src/main/scala/org/spot/utilities/DomainProcessor.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.utilities +package org.spot.utilities import org.apache.spark.broadcast.Broadcast diff --git a/src/main/scala/org/opennetworkinsight/utilities/Entropy.scala b/src/main/scala/org/spot/utilities/Entropy.scala similarity index 91% rename from src/main/scala/org/opennetworkinsight/utilities/Entropy.scala rename to src/main/scala/org/spot/utilities/Entropy.scala index b1bbe00..c9a2b87 100644 --- a/src/main/scala/org/opennetworkinsight/utilities/Entropy.scala +++ b/src/main/scala/org/spot/utilities/Entropy.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.utilities +package org.spot.utilities import scala.math._ diff --git a/src/main/scala/org/opennetworkinsight/utilities/Quantiles.scala b/src/main/scala/org/spot/utilities/Quantiles.scala similarity index 98% rename from src/main/scala/org/opennetworkinsight/utilities/Quantiles.scala rename to src/main/scala/org/spot/utilities/Quantiles.scala index a0fcdfc..e1603e1 100644 --- a/src/main/scala/org/opennetworkinsight/utilities/Quantiles.scala +++ b/src/main/scala/org/spot/utilities/Quantiles.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.utilities +package org.spot.utilities import org.apache.spark.rdd.RDD diff --git a/src/main/scala/org/opennetworkinsight/utilities/TimeUtilities.scala b/src/main/scala/org/spot/utilities/TimeUtilities.scala similarity index 87% rename from src/main/scala/org/opennetworkinsight/utilities/TimeUtilities.scala rename to src/main/scala/org/spot/utilities/TimeUtilities.scala index 1bbe075..bdf94a5 100644 --- a/src/main/scala/org/opennetworkinsight/utilities/TimeUtilities.scala +++ b/src/main/scala/org/spot/utilities/TimeUtilities.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.utilities +package org.spot.utilities object TimeUtilities { diff --git a/src/main/scala/org/opennetworkinsight/utilities/TopDomains.scala b/src/main/scala/org/spot/utilities/TopDomains.scala similarity index 86% rename from src/main/scala/org/opennetworkinsight/utilities/TopDomains.scala rename to src/main/scala/org/spot/utilities/TopDomains.scala index c1d1b32..f6eea88 100644 --- a/src/main/scala/org/opennetworkinsight/utilities/TopDomains.scala +++ b/src/main/scala/org/spot/utilities/TopDomains.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.utilities +package org.spot.utilities import scala.io.Source diff --git a/src/test/scala/org/opennetworkinsight/DNSWordCreationTest.scala b/src/test/scala/org/spot/DNSWordCreationTest.scala similarity index 56% rename from src/test/scala/org/opennetworkinsight/DNSWordCreationTest.scala rename to src/test/scala/org/spot/DNSWordCreationTest.scala index a42d667..064863a 100644 --- a/src/test/scala/org/opennetworkinsight/DNSWordCreationTest.scala +++ b/src/test/scala/org/spot/DNSWordCreationTest.scala @@ -1,11 +1,11 @@ -package org.opennetworkinsight +package org.spot import javax.swing.text.Utilities -import org.opennetworkinsight.dns.{DNSSuspiciousConnectsAnalysis, DNSWordCreation} -import org.opennetworkinsight.testutils.TestingSparkContextFlatSpec -import org.opennetworkinsight.utilities.{CountryCodes, Entropy, TopDomains} +import org.spot.dns.{DNSSuspiciousConnectsAnalysis, DNSWordCreation} +import org.spot.testutils.TestingSparkContextFlatSpec +import org.spot.utilities.{CountryCodes, Entropy, TopDomains} import 
org.scalatest.Matchers class DNSWordCreationTest extends TestingSparkContextFlatSpec with Matchers { diff --git a/src/test/scala/org/opennetworkinsight/FlowWordCreationTest.scala b/src/test/scala/org/spot/FlowWordCreationTest.scala similarity index 99% rename from src/test/scala/org/opennetworkinsight/FlowWordCreationTest.scala rename to src/test/scala/org/spot/FlowWordCreationTest.scala index 18d5a28..a8081e9 100644 --- a/src/test/scala/org/opennetworkinsight/FlowWordCreationTest.scala +++ b/src/test/scala/org/spot/FlowWordCreationTest.scala @@ -1,7 +1,7 @@ -package org.opennetworkinsight +package org.spot -import org.opennetworkinsight.netflow.FlowWordCreation +import org.spot.netflow.FlowWordCreation import org.scalatest.{FlatSpec, Matchers} class FlowWordCreationTest extends FlatSpec with Matchers { diff --git a/src/test/scala/org/opennetworkinsight/QuantilesTest.scala b/src/test/scala/org/spot/QuantilesTest.scala similarity index 98% rename from src/test/scala/org/opennetworkinsight/QuantilesTest.scala rename to src/test/scala/org/spot/QuantilesTest.scala index 5e0790c..562f864 100644 --- a/src/test/scala/org/opennetworkinsight/QuantilesTest.scala +++ b/src/test/scala/org/spot/QuantilesTest.scala @@ -1,7 +1,7 @@ -package org.opennetworkinsight +package org.spot import org.apache.spark.rdd.RDD -import org.opennetworkinsight.utilities.Quantiles +import org.spot.utilities.Quantiles import org.scalatest.Matchers import testutils.TestingSparkContextFlatSpec diff --git a/src/test/scala/org/opennetworkinsight/OniLDACWrapperTest.scala b/src/test/scala/org/spot/SpotLDACWrapperTest.scala similarity index 87% rename from src/test/scala/org/opennetworkinsight/OniLDACWrapperTest.scala rename to src/test/scala/org/spot/SpotLDACWrapperTest.scala index c374de8..fb9bd62 100644 --- a/src/test/scala/org/opennetworkinsight/OniLDACWrapperTest.scala +++ b/src/test/scala/org/spot/SpotLDACWrapperTest.scala @@ -1,18 +1,18 @@ -package org.opennetworkinsight +package org.spot -import org.opennetworkinsight.OniLDACWrapper.OniLDACInput -import org.opennetworkinsight.testutils.TestingSparkContextFlatSpec +import org.spot.SpotLDACWrapper.SpotLDACInput +import org.spot.testutils.TestingSparkContextFlatSpec import org.scalatest.Matchers -class OniLDACWrapperTest extends TestingSparkContextFlatSpec with Matchers{ +class SpotLDACWrapperTest extends TestingSparkContextFlatSpec with Matchers{ "normalizeWord" should "calculate exponential of each value in the input string, then sum up all the exponential and " + "then divide each exponential by the total sum" in { val wordProbability = "1 2 3 4 5" - val result = OniLDACWrapper.getWordProbabilitesFromTopicLine(wordProbability) + val result = SpotLDACWrapper.getWordProbabilitesFromTopicLine(wordProbability) result.length shouldBe 5 result(0) shouldBe 0.011656230956039607 @@ -30,7 +30,7 @@ class OniLDACWrapperTest extends TestingSparkContextFlatSpec with Matchers{ "0.0124531442 0.0124531442 0.0124531442 23983.5532262138 0.0124531442 0.0124531442 0.0124531442 0.0124531442 " + "0.0124531442 0.0124531442 22999.4716800747 0.0124531442" - var (docOUT, topicMixOUT) = OniLDACWrapper.getTopicDocument(document, line) + var (docOUT, topicMixOUT) = SpotLDACWrapper.getTopicDocument(document, line) docOUT shouldBe document topicMixOUT shouldBe Array(2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, @@ -44,7 +44,7 @@ class OniLDACWrapperTest extends TestingSparkContextFlatSpec with Matchers{ val document = "192.168.1.1" val line = "0.0 0.0 1.0 
-1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0" - val (docOUT, topicMixOUT) = OniLDACWrapper.getTopicDocument(document, line) + val (docOUT, topicMixOUT) = SpotLDACWrapper.getTopicDocument(document, line) docOUT shouldBe document topicMixOUT shouldBe Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) @@ -53,19 +53,19 @@ class OniLDACWrapperTest extends TestingSparkContextFlatSpec with Matchers{ "createModel" should "return model in Array[String] format. Each string should contain the document count and the" + "total count for each word" in { - val documentWordData = sparkContext.parallelize(Array(OniLDACInput("192.168.1.1", "333333_7.0_0.0_1.0", 8), - OniLDACInput("10.10.98.123", "1111111_6.0_3.0_5.0", 4), - OniLDACInput("66.23.45.11", "-1_43_7.0_2.0_6.0", 2), - OniLDACInput("192.168.1.1", "-1_80_6.0_1.0_1.0", 5))) + val documentWordData = sparkContext.parallelize(Array(SpotLDACInput("192.168.1.1", "333333_7.0_0.0_1.0", 8), + SpotLDACInput("10.10.98.123", "1111111_6.0_3.0_5.0", 4), + SpotLDACInput("66.23.45.11", "-1_43_7.0_2.0_6.0", 2), + SpotLDACInput("192.168.1.1", "-1_80_6.0_1.0_1.0", 5))) val wordDictionary = Map("333333_7.0_0.0_1.0" -> 0, "1111111_6.0_3.0_5.0" -> 1, "-1_43_7.0_2.0_6.0" -> 2, "-1_80_6.0_1.0_1.0" -> 3) - val distinctDocument = documentWordData.map({case OniLDACInput(doc, word, count) => doc}).distinct.collect() + val distinctDocument = documentWordData.map({case SpotLDACInput(doc, word, count) => doc}).distinct.collect() - val model = OniLDACWrapper.createModel(documentWordData, wordDictionary, distinctDocument) + val model = SpotLDACWrapper.createModel(documentWordData, wordDictionary, distinctDocument) model should contain ("2 0:8 3:5") model should contain ("1 1:4") @@ -92,7 +92,7 @@ class OniLDACWrapperTest extends TestingSparkContextFlatSpec with Matchers{ "0.0124531442 0.0124531442 0.0124531442 0.0124531442 0.0124531442 0.0124531442 0.0124531442 12.0124531442 " + "0.0124531442 0.0124531442 0.0124531442 0.0124531442") - val results = OniLDACWrapper.getDocumentResults(topicDocumentData, documentDictionary) + val results = SpotLDACWrapper.getDocumentResults(topicDocumentData, documentDictionary) results("66.23.45.11") shouldBe Array(2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, 2.6505498126219955E-7, @@ -140,7 +140,7 @@ class OniLDACWrapperTest extends TestingSparkContextFlatSpec with Matchers{ "-535.7333037656 -532.2623144682 -528.2020876890 -532.0482294927", "-18.4350359818 -534.4612736041 -530.4010468817 -11.0784977885") - val results = OniLDACWrapper.getWordToProbPerTopicMap(topicWordData, wordDictionary) + val results = SpotLDACWrapper.getWordToProbPerTopicMap(topicWordData, wordDictionary) results.keySet.size shouldBe 4 results("23.0_7.0_7.0_4.0").length shouldBe 20 diff --git a/src/test/scala/org/opennetworkinsight/testutils/TestingSparkContext.scala b/src/test/scala/org/spot/testutils/TestingSparkContext.scala similarity index 98% rename from src/test/scala/org/opennetworkinsight/testutils/TestingSparkContext.scala rename to src/test/scala/org/spot/testutils/TestingSparkContext.scala index 4d75a2d..1c50c91 100644 --- a/src/test/scala/org/opennetworkinsight/testutils/TestingSparkContext.scala +++ b/src/test/scala/org/spot/testutils/TestingSparkContext.scala @@ -1,4 +1,4 @@ -package org.opennetworkinsight.testutils +package org.spot.testutils import 
java.util.Date diff --git a/src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextFlatSpec.scala b/src/test/scala/org/spot/testutils/TestingSparkContextFlatSpec.scala similarity index 94% rename from src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextFlatSpec.scala rename to src/test/scala/org/spot/testutils/TestingSparkContextFlatSpec.scala index 3bc579b..fa0dd94 100644 --- a/src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextFlatSpec.scala +++ b/src/test/scala/org/spot/testutils/TestingSparkContextFlatSpec.scala @@ -2,7 +2,7 @@ * THIS CODE WAS COPIED DIRECTLY FROM THE OPEN SOURCE PROJECT TAP (Trusted Analytics Platform) * which has an Apache V2.0 */ -package org.opennetworkinsight.testutils +package org.spot.testutils import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext diff --git a/src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextFunSuite.scala b/src/test/scala/org/spot/testutils/TestingSparkContextFunSuite.scala similarity index 94% rename from src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextFunSuite.scala rename to src/test/scala/org/spot/testutils/TestingSparkContextFunSuite.scala index 6ad0c14..49b3f05 100644 --- a/src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextFunSuite.scala +++ b/src/test/scala/org/spot/testutils/TestingSparkContextFunSuite.scala @@ -3,7 +3,7 @@ * which has an Apache V2.0 */ -package org.opennetworkinsight.testutils +package org.spot.testutils import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext diff --git a/src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextWordSpec.scala b/src/test/scala/org/spot/testutils/TestingSparkContextWordSpec.scala similarity index 94% rename from src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextWordSpec.scala rename to src/test/scala/org/spot/testutils/TestingSparkContextWordSpec.scala index 9d84d1b..5169db3 100644 --- a/src/test/scala/org/opennetworkinsight/testutils/TestingSparkContextWordSpec.scala +++ b/src/test/scala/org/spot/testutils/TestingSparkContextWordSpec.scala @@ -3,7 +3,7 @@ * which has an Apache V2.0 */ -package org.opennetworkinsight.testutils +package org.spot.testutils import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext diff --git a/src/test/scala/org/opennetworkinsight/utilities/DomainProcessorTest.scala b/src/test/scala/org/spot/utilities/DomainProcessorTest.scala similarity index 96% rename from src/test/scala/org/opennetworkinsight/utilities/DomainProcessorTest.scala rename to src/test/scala/org/spot/utilities/DomainProcessorTest.scala index ffedee0..6de4e3e 100644 --- a/src/test/scala/org/opennetworkinsight/utilities/DomainProcessorTest.scala +++ b/src/test/scala/org/spot/utilities/DomainProcessorTest.scala @@ -1,8 +1,8 @@ -package org.opennetworkinsight.utilities +package org.spot.utilities -import org.opennetworkinsight.testutils.TestingSparkContextFlatSpec +import org.spot.testutils.TestingSparkContextFlatSpec import org.scalatest.{FunSuite, Matchers} -import org.opennetworkinsight.utilities.DomainProcessor._ +import org.spot.utilities.DomainProcessor._ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers { From 070110da0ec2718b2d7bc0121ff848a20f82f62e Mon Sep 17 00:00:00 2001 From: nlsegerl Date: Wed, 28 Sep 2016 09:51:47 -0700 Subject: [PATCH 2/6] spot_naming Open Network Insight => Apache Spot (incubating) in the readme and install files. 
--- INSTALL.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 0b7d5e5..810dedc 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,6 +1,6 @@ # spot-ml -Machine learning routines for OpenNetworkInsight, version 1.1 +Machine learning routines for Apache Spot (incubating). ## Prerequisites, Installation and Configuration diff --git a/README.md b/README.md index f3b365f..a4eab66 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # spot-ml -Machine learning routines for OpenNetworkInsight, version 1.1 +Machine learning routines for Apache Spot (incubating). At present, spot-ml contains routines for performing *suspicious connections* analyses on netflow, DNS or proxy data gathered from a network. These analyses consume a (possibly very large) collection of network events and produce a list of the events that are considered to be the least probable (or most suspicious). From f7d5f24ab37dc1f9d05a9cac809436b31684a350 Mon Sep 17 00:00:00 2001 From: nlsegerl Date: Thu, 29 Sep 2016 11:24:16 -0700 Subject: [PATCH 3/6] spot_naming update of ml_ops.sh finally added the util script upload.sh to git --- ml_ops.sh | 2 +- upload.sh | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100755 upload.sh diff --git a/ml_ops.sh b/ml_ops.sh index d60cd2c..f87d33e 100755 --- a/ml_ops.sh +++ b/ml_ops.sh @@ -84,7 +84,7 @@ hdfs dfs -rm -R -f ${HDFS_SCORED_CONNECTS} ${MPI_PREP_CMD} -time spark-submit --class "org.opennetworkinsight.SuspiciousConnects" \ +time spark-submit --class "org.spot.SuspiciousConnects" \ --master yarn-client \ --driver-memory ${SPK_DRIVER_MEM} \ --conf spark.driver.maxResultSize=${SPK_DRIVER_MAX_RESULTS} \ diff --git a/upload.sh b/upload.sh new file mode 100755 index 0000000..72d29fb --- /dev/null +++ b/upload.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +USER=$1 +CLUSTER=$2 +DIRNAME=$3 + +# any optional arguments you wish to pass directly to rsync go in position 4 + +# exclude the hidden files, target, lib, project, src... except the assembled jar +rsync $4 -v -a --include='target' --include='target/scala-2.10' --include='target/scala-2.10/*.jar' \ + --include='spot-lda-c' --include='spot-lda-c/*' --include='*.py' --include='*.sh' --include 'top-1m.csv' \ + --exclude='*' . $USER@$CLUSTER:~/$DIRNAME From 707bdc224c71158a7c76cf4628dae5cfd09a514e Mon Sep 17 00:00:00 2001 From: nlsegerl Date: Thu, 29 Sep 2016 11:40:13 -0700 Subject: [PATCH 4/6] spot_naming replacing blown away .gitmodules file --- .gitmodules | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitmodules diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..d9cfe6a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "oni-lda-c"] + path = oni-lda-c + url = https://github.com/OpenNetworkInsight/oni-lda-c From f26816b0ff467427f4671d572409f89be99b16d6 Mon Sep 17 00:00:00 2001 From: nlsegerl Date: Thu, 29 Sep 2016 13:51:07 -0700 Subject: [PATCH 5/6] spot_naming org.spot => org.apache.spot Oh Yeah....
--- ml_ops.sh | 2 +- .../{ => apache}/spot/SpotLDACWrapper.scala | 2 +- .../{ => apache}/spot/SuspiciousConnects.scala | 10 +++++----- .../SuspiciousConnectsArgumentParser.scala | 2 +- .../spot/SuspiciousConnectsScoreFunction.scala | 2 +- .../org/{ => apache}/spot/dns/DNSSchema.scala | 4 ++-- .../dns/DNSSuspiciousConnectsAnalysis.scala | 14 +++++++------- .../spot/dns/DNSWordCreation.scala | 6 +++--- .../spot/dns/model/DNSFeedback.scala | 4 ++-- .../spot/dns/model/DNSScoreFunction.scala | 6 +++--- .../dns/model/DNSSuspiciousConnectsModel.scala | 18 +++++++++--------- .../sideinformation/DNSSideInformation.scala | 12 ++++++------ .../DNSSideInformationFunction.scala | 6 +++--- .../spot/netflow/FlowColumnIndex.scala | 2 +- .../spot/netflow/FlowPostLDA.scala | 4 ++-- .../{ => apache}/spot/netflow/FlowPreLDA.scala | 6 +++--- .../{ => apache}/spot/netflow/FlowSchema.scala | 2 +- .../spot/netflow/FlowSuspiciousConnects.scala | 8 ++++---- .../spot/netflow/FlowWordCreation.scala | 6 +++--- .../spot/proxy/ProxyFeedback.scala | 4 ++-- .../{ => apache}/spot/proxy/ProxySchema.scala | 2 +- .../ProxySuspiciousConnectsAnalysis.scala | 8 ++++---- .../proxy/ProxySuspiciousConnectsModel.scala | 12 ++++++------ .../spot/proxy/ProxyWordCreation.scala | 4 ++-- .../spot/utilities/CountryCodes.scala | 2 +- .../spot/utilities/DataFrameUtils.scala | 2 +- .../spot/utilities/DomainProcessor.scala | 2 +- .../{ => apache}/spot/utilities/Entropy.scala | 2 +- .../spot/utilities/Quantiles.scala | 2 +- .../spot/utilities/TimeUtilities.scala | 2 +- .../spot/utilities/TopDomains.scala | 2 +- .../spot/DNSWordCreationTest.scala | 8 ++++---- .../spot/FlowWordCreationTest.scala | 4 ++-- .../org/{ => apache}/spot/QuantilesTest.scala | 4 ++-- .../spot/SpotLDACWrapperTest.scala | 6 +++--- .../spot/testutils/TestingSparkContext.scala | 2 +- .../TestingSparkContextFlatSpec.scala | 2 +- .../TestingSparkContextFunSuite.scala | 2 +- .../TestingSparkContextWordSpec.scala | 2 +- .../spot/utilities/DomainProcessorTest.scala | 6 +++--- 40 files changed, 98 insertions(+), 98 deletions(-) rename src/main/scala/org/{ => apache}/spot/SpotLDACWrapper.scala (99%) rename src/main/scala/org/{ => apache}/spot/SuspiciousConnects.scala (86%) rename src/main/scala/org/{ => apache}/spot/SuspiciousConnectsArgumentParser.scala (99%) rename src/main/scala/org/{ => apache}/spot/SuspiciousConnectsScoreFunction.scala (96%) rename src/main/scala/org/{ => apache}/spot/dns/DNSSchema.scala (95%) rename src/main/scala/org/{ => apache}/spot/dns/DNSSuspiciousConnectsAnalysis.scala (86%) rename src/main/scala/org/{ => apache}/spot/dns/DNSWordCreation.scala (95%) rename src/main/scala/org/{ => apache}/spot/dns/model/DNSFeedback.scala (96%) rename src/main/scala/org/{ => apache}/spot/dns/model/DNSScoreFunction.scala (92%) rename src/main/scala/org/{ => apache}/spot/dns/model/DNSSuspiciousConnectsModel.scala (94%) rename src/main/scala/org/{ => apache}/spot/dns/sideinformation/DNSSideInformation.scala (87%) rename src/main/scala/org/{ => apache}/spot/dns/sideinformation/DNSSideInformationFunction.scala (93%) rename src/main/scala/org/{ => apache}/spot/netflow/FlowColumnIndex.scala (92%) rename src/main/scala/org/{ => apache}/spot/netflow/FlowPostLDA.scala (98%) rename src/main/scala/org/{ => apache}/spot/netflow/FlowPreLDA.scala (97%) rename src/main/scala/org/{ => apache}/spot/netflow/FlowSchema.scala (97%) rename src/main/scala/org/{ => apache}/spot/netflow/FlowSuspiciousConnects.scala (84%) rename src/main/scala/org/{ => apache}/spot/netflow/FlowWordCreation.scala 
(97%) rename src/main/scala/org/{ => apache}/spot/proxy/ProxyFeedback.scala (97%) rename src/main/scala/org/{ => apache}/spot/proxy/ProxySchema.scala (97%) rename src/main/scala/org/{ => apache}/spot/proxy/ProxySuspiciousConnectsAnalysis.scala (90%) rename src/main/scala/org/{ => apache}/spot/proxy/ProxySuspiciousConnectsModel.scala (95%) rename src/main/scala/org/{ => apache}/spot/proxy/ProxyWordCreation.scala (94%) rename src/main/scala/org/{ => apache}/spot/utilities/CountryCodes.scala (97%) rename src/main/scala/org/{ => apache}/spot/utilities/DataFrameUtils.scala (96%) rename src/main/scala/org/{ => apache}/spot/utilities/DomainProcessor.scala (99%) rename src/main/scala/org/{ => apache}/spot/utilities/Entropy.scala (93%) rename src/main/scala/org/{ => apache}/spot/utilities/Quantiles.scala (99%) rename src/main/scala/org/{ => apache}/spot/utilities/TimeUtilities.scala (89%) rename src/main/scala/org/{ => apache}/spot/utilities/TopDomains.scala (88%) rename src/test/scala/org/{ => apache}/spot/DNSWordCreationTest.scala (59%) rename src/test/scala/org/{ => apache}/spot/FlowWordCreationTest.scala (99%) rename src/test/scala/org/{ => apache}/spot/QuantilesTest.scala (98%) rename src/test/scala/org/{ => apache}/spot/SpotLDACWrapperTest.scala (98%) rename src/test/scala/org/{ => apache}/spot/testutils/TestingSparkContext.scala (98%) rename src/test/scala/org/{ => apache}/spot/testutils/TestingSparkContextFlatSpec.scala (95%) rename src/test/scala/org/{ => apache}/spot/testutils/TestingSparkContextFunSuite.scala (95%) rename src/test/scala/org/{ => apache}/spot/testutils/TestingSparkContextWordSpec.scala (95%) rename src/test/scala/org/{ => apache}/spot/utilities/DomainProcessorTest.scala (96%) diff --git a/ml_ops.sh b/ml_ops.sh index f87d33e..1b24827 100755 --- a/ml_ops.sh +++ b/ml_ops.sh @@ -84,7 +84,7 @@ hdfs dfs -rm -R -f ${HDFS_SCORED_CONNECTS} ${MPI_PREP_CMD} -time spark-submit --class "org.spot.SuspiciousConnects" \ +time spark-submit --class "org.apache.spot.SuspiciousConnects" \ --master yarn-client \ --driver-memory ${SPK_DRIVER_MEM} \ --conf spark.driver.maxResultSize=${SPK_DRIVER_MAX_RESULTS} \ diff --git a/src/main/scala/org/spot/SpotLDACWrapper.scala b/src/main/scala/org/apache/spot/SpotLDACWrapper.scala similarity index 99% rename from src/main/scala/org/spot/SpotLDACWrapper.scala rename to src/main/scala/org/apache/spot/SpotLDACWrapper.scala index 8a7f405..22bca46 100644 --- a/src/main/scala/org/spot/SpotLDACWrapper.scala +++ b/src/main/scala/org/apache/spot/SpotLDACWrapper.scala @@ -1,4 +1,4 @@ -package org.spot +package org.apache.spot import org.apache.spark.rdd.RDD import java.io.PrintWriter diff --git a/src/main/scala/org/spot/SuspiciousConnects.scala b/src/main/scala/org/apache/spot/SuspiciousConnects.scala similarity index 86% rename from src/main/scala/org/spot/SuspiciousConnects.scala rename to src/main/scala/org/apache/spot/SuspiciousConnects.scala index d948f62..425196c 100644 --- a/src/main/scala/org/spot/SuspiciousConnects.scala +++ b/src/main/scala/org/apache/spot/SuspiciousConnects.scala @@ -1,12 +1,12 @@ -package org.spot +package org.apache.spot import org.apache.log4j.{Level, LogManager, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} -import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.spot.dns.DNSSuspiciousConnectsAnalysis -import org.spot.netflow.FlowSuspiciousConnects -import org.spot.proxy.ProxySuspiciousConnectsAnalysis +import 
org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.apache.spot.dns.DNSSuspiciousConnectsAnalysis +import org.apache.spot.netflow.FlowSuspiciousConnects +import org.apache.spot.proxy.ProxySuspiciousConnectsAnalysis /** * Top level entrypoint to execute suspicious connections analysis on network data. diff --git a/src/main/scala/org/spot/SuspiciousConnectsArgumentParser.scala b/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala similarity index 99% rename from src/main/scala/org/spot/SuspiciousConnectsArgumentParser.scala rename to src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala index f8ffeaa..eb7ffe8 100644 --- a/src/main/scala/org/spot/SuspiciousConnectsArgumentParser.scala +++ b/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala @@ -1,4 +1,4 @@ -package org.spot +package org.apache.spot /** diff --git a/src/main/scala/org/spot/SuspiciousConnectsScoreFunction.scala b/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala similarity index 96% rename from src/main/scala/org/spot/SuspiciousConnectsScoreFunction.scala rename to src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala index 549fee6..7bafebb 100644 --- a/src/main/scala/org/spot/SuspiciousConnectsScoreFunction.scala +++ b/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala @@ -1,4 +1,4 @@ -package org.spot +package org.apache.spot import org.apache.spark.broadcast.Broadcast diff --git a/src/main/scala/org/spot/dns/DNSSchema.scala b/src/main/scala/org/apache/spot/dns/DNSSchema.scala similarity index 95% rename from src/main/scala/org/spot/dns/DNSSchema.scala rename to src/main/scala/org/apache/spot/dns/DNSSchema.scala index 97b9431..fd8f33e 100644 --- a/src/main/scala/org/spot/dns/DNSSchema.scala +++ b/src/main/scala/org/apache/spot/dns/DNSSchema.scala @@ -1,8 +1,8 @@ -package org.spot.dns +package org.apache.spot.dns import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -import org.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema +import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema /** * Data frame schemas and column names used in the DNS suspicious connects analysis. 
diff --git a/src/main/scala/org/spot/dns/DNSSuspiciousConnectsAnalysis.scala b/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala similarity index 86% rename from src/main/scala/org/spot/dns/DNSSuspiciousConnectsAnalysis.scala rename to src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala index e08c5c7..b987a37 100644 --- a/src/main/scala/org/spot/dns/DNSSuspiciousConnectsAnalysis.scala +++ b/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala @@ -1,17 +1,17 @@ -package org.spot.dns +package org.apache.spot.dns import org.apache.spark.SparkContext import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, SQLContext} -import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.spot.dns.DNSSchema._ -import org.spot.dns.model.DNSSuspiciousConnectsModel -import org.spot.dns.sideinformation.DNSSideInformation +import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.apache.spot.dns.DNSSchema._ +import org.apache.spot.dns.model.DNSSuspiciousConnectsModel +import org.apache.spot.dns.sideinformation.DNSSideInformation import org.apache.log4j.Logger -import org.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema -import org.spot.dns.sideinformation.DNSSideInformation.SideInfoSchema +import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema +import org.apache.spot.dns.sideinformation.DNSSideInformation.SideInfoSchema /** * The suspicious connections analysis of DNS log data develops a probabilistic model the DNS queries diff --git a/src/main/scala/org/spot/dns/DNSWordCreation.scala b/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala similarity index 95% rename from src/main/scala/org/spot/dns/DNSWordCreation.scala rename to src/main/scala/org/apache/spot/dns/DNSWordCreation.scala index 544b31f..2ffe12e 100644 --- a/src/main/scala/org/spot/dns/DNSWordCreation.scala +++ b/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala @@ -1,9 +1,9 @@ -package org.spot.dns +package org.apache.spot.dns import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions._ -import org.spot.utilities.DomainProcessor.{DomainInfo, extractDomainInfo} -import org.spot.utilities.Quantiles +import org.apache.spot.utilities.DomainProcessor.{DomainInfo, extractDomainInfo} +import org.apache.spot.utilities.Quantiles /** diff --git a/src/main/scala/org/spot/dns/model/DNSFeedback.scala b/src/main/scala/org/apache/spot/dns/model/DNSFeedback.scala similarity index 96% rename from src/main/scala/org/spot/dns/model/DNSFeedback.scala rename to src/main/scala/org/apache/spot/dns/model/DNSFeedback.scala index 9048e97..7aab884 100644 --- a/src/main/scala/org/spot/dns/model/DNSFeedback.scala +++ b/src/main/scala/org/apache/spot/dns/model/DNSFeedback.scala @@ -1,9 +1,9 @@ -package org.spot.dns.model +package org.apache.spot.dns.model import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.spot.dns.model.DNSSuspiciousConnectsModel.{ModelSchema, modelColumns} +import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.{ModelSchema, modelColumns} import scala.io.Source diff --git a/src/main/scala/org/spot/dns/model/DNSScoreFunction.scala b/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala similarity index 92% rename from src/main/scala/org/spot/dns/model/DNSScoreFunction.scala rename to 
src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala index 3129e8a..09656f2 100644 --- a/src/main/scala/org/spot/dns/model/DNSScoreFunction.scala +++ b/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala @@ -1,8 +1,8 @@ -package org.spot.dns.model +package org.apache.spot.dns.model import org.apache.spark.broadcast.Broadcast -import org.spot.SuspiciousConnectsScoreFunction -import org.spot.dns.DNSWordCreation +import org.apache.spot.SuspiciousConnectsScoreFunction +import org.apache.spot.dns.DNSWordCreation /** diff --git a/src/main/scala/org/spot/dns/model/DNSSuspiciousConnectsModel.scala b/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala similarity index 94% rename from src/main/scala/org/spot/dns/model/DNSSuspiciousConnectsModel.scala rename to src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala index d8c289b..d8580c3 100644 --- a/src/main/scala/org/spot/dns/model/DNSSuspiciousConnectsModel.scala +++ b/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala @@ -1,4 +1,4 @@ -package org.spot.dns.model +package org.apache.spot.dns.model import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast @@ -6,13 +6,13 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.spot.SpotLDACWrapper -import org.spot.SpotLDACWrapper.{SpotLDACInput, SpotLDACOutput} -import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.spot.dns.DNSSchema._ -import org.spot.dns.DNSWordCreation -import org.spot.utilities.{CountryCodes, DomainProcessor, Quantiles, TopDomains} -import org.spot.utilities.DomainProcessor.DomainInfo +import org.apache.spot.SpotLDACWrapper +import org.apache.spot.SpotLDACWrapper.{SpotLDACInput, SpotLDACOutput} +import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.apache.spot.dns.DNSSchema._ +import org.apache.spot.dns.DNSWordCreation +import org.apache.spot.utilities.{CountryCodes, DomainProcessor, Quantiles, TopDomains} +import org.apache.spot.utilities.DomainProcessor.DomainInfo import org.apache.log4j.Logger @@ -64,7 +64,7 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int, * @param sc Spark Context * @param sqlContext Spark SQL context * @param inDF Dataframe of DNS log events, containing at least the columns of [[DNSSuspiciousConnectsModel.ModelSchema]] - * @return Dataframe with a column named [[org.spot.dns.DNSSchema.Score]] that contains the + * @return Dataframe with a column named [[org.apache.spot.dns.DNSSchema.Score]] that contains the * probability estimated for the network event at that row */ def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame): DataFrame = { diff --git a/src/main/scala/org/spot/dns/sideinformation/DNSSideInformation.scala b/src/main/scala/org/apache/spot/dns/sideinformation/DNSSideInformation.scala similarity index 87% rename from src/main/scala/org/spot/dns/sideinformation/DNSSideInformation.scala rename to src/main/scala/org/apache/spot/dns/sideinformation/DNSSideInformation.scala index 1d4f49e..28e95b3 100644 --- a/src/main/scala/org/spot/dns/sideinformation/DNSSideInformation.scala +++ b/src/main/scala/org/apache/spot/dns/sideinformation/DNSSideInformation.scala @@ -1,19 +1,19 @@ -package org.spot.dns.sideinformation +package org.apache.spot.dns.sideinformation import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import 
org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.spot.dns.DNSSchema._ -import org.spot.dns.model.DNSSuspiciousConnectsModel -import org.spot.utilities.{CountryCodes, TopDomains} +import org.apache.spot.dns.DNSSchema._ +import org.apache.spot.dns.model.DNSSuspiciousConnectsModel +import org.apache.spot.utilities.{CountryCodes, TopDomains} /** * Create the side information required by the OA layer. * - * @param model An instance of [[org.spot.dns.model.DNSSuspiciousConnectsModel]] + * @param model An instance of [[org.apache.spot.dns.model.DNSSuspiciousConnectsModel]] * Necessary because much of the side information is auxiliary information used to * construct the model. */ @@ -26,7 +26,7 @@ class DNSSideInformation(model: DNSSuspiciousConnectsModel) { * @param sparkContext Spark context. * @param sqlContext Spark SQL context. * @param inDF dataframe containing at least the rows of - * [[org.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema]] + * [[org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema]] * @return copy of the dataframe with the columns [[DNSSideInformation.SideInfoSchema]] added */ def addSideInformationForOA(sparkContext: SparkContext, diff --git a/src/main/scala/org/spot/dns/sideinformation/DNSSideInformationFunction.scala b/src/main/scala/org/apache/spot/dns/sideinformation/DNSSideInformationFunction.scala similarity index 93% rename from src/main/scala/org/spot/dns/sideinformation/DNSSideInformationFunction.scala rename to src/main/scala/org/apache/spot/dns/sideinformation/DNSSideInformationFunction.scala index 44d58ff..2c0912b 100644 --- a/src/main/scala/org/spot/dns/sideinformation/DNSSideInformationFunction.scala +++ b/src/main/scala/org/apache/spot/dns/sideinformation/DNSSideInformationFunction.scala @@ -1,9 +1,9 @@ -package org.spot.dns.sideinformation +package org.apache.spot.dns.sideinformation import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.Row -import org.spot.dns.DNSWordCreation -import org.spot.utilities.DomainProcessor.{DomainInfo, extractDomainInfo} +import org.apache.spot.dns.DNSWordCreation +import org.apache.spot.utilities.DomainProcessor.{DomainInfo, extractDomainInfo} /** * Add side information for OA to a dataframe.
diff --git a/src/main/scala/org/spot/netflow/FlowColumnIndex.scala b/src/main/scala/org/apache/spot/netflow/FlowColumnIndex.scala similarity index 92% rename from src/main/scala/org/spot/netflow/FlowColumnIndex.scala rename to src/main/scala/org/apache/spot/netflow/FlowColumnIndex.scala index a3aede2..72a517a 100644 --- a/src/main/scala/org/spot/netflow/FlowColumnIndex.scala +++ b/src/main/scala/org/apache/spot/netflow/FlowColumnIndex.scala @@ -1,4 +1,4 @@ -package org.spot.netflow +package org.apache.spot.netflow object FlowColumnIndex extends Enumeration { val HOUR = 4 diff --git a/src/main/scala/org/spot/netflow/FlowPostLDA.scala b/src/main/scala/org/apache/spot/netflow/FlowPostLDA.scala similarity index 98% rename from src/main/scala/org/spot/netflow/FlowPostLDA.scala rename to src/main/scala/org/apache/spot/netflow/FlowPostLDA.scala index 06b5fc1..42f5767 100644 --- a/src/main/scala/org/spot/netflow/FlowPostLDA.scala +++ b/src/main/scala/org/apache/spot/netflow/FlowPostLDA.scala @@ -1,4 +1,4 @@ -package org.spot.netflow +package org.apache.spot.netflow import org.apache.log4j.Logger import org.apache.spark.SparkContext @@ -6,7 +6,7 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.spot.netflow.FlowSchema._ +import org.apache.spot.netflow.FlowSchema._ /** * Contains routines for scoring incoming netflow records from a netflow suspicious connections model. diff --git a/src/main/scala/org/spot/netflow/FlowPreLDA.scala b/src/main/scala/org/apache/spot/netflow/FlowPreLDA.scala similarity index 97% rename from src/main/scala/org/spot/netflow/FlowPreLDA.scala rename to src/main/scala/org/apache/spot/netflow/FlowPreLDA.scala index ffdc907..b789818 100755 --- a/src/main/scala/org/spot/netflow/FlowPreLDA.scala +++ b/src/main/scala/org/apache/spot/netflow/FlowPreLDA.scala @@ -1,12 +1,12 @@ -package org.spot.netflow +package org.apache.spot.netflow import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.spot.SpotLDACWrapper.SpotLDACInput -import org.spot.netflow.FlowSchema._ +import org.apache.spot.SpotLDACWrapper.SpotLDACInput +import org.apache.spot.netflow.FlowSchema._ import scala.io.Source diff --git a/src/main/scala/org/spot/netflow/FlowSchema.scala b/src/main/scala/org/apache/spot/netflow/FlowSchema.scala similarity index 97% rename from src/main/scala/org/spot/netflow/FlowSchema.scala rename to src/main/scala/org/apache/spot/netflow/FlowSchema.scala index 67be4a9..176c74a 100644 --- a/src/main/scala/org/spot/netflow/FlowSchema.scala +++ b/src/main/scala/org/apache/spot/netflow/FlowSchema.scala @@ -1,4 +1,4 @@ -package org.spot.netflow +package org.apache.spot.netflow object FlowSchema { diff --git a/src/main/scala/org/spot/netflow/FlowSuspiciousConnects.scala b/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnects.scala similarity index 84% rename from src/main/scala/org/spot/netflow/FlowSuspiciousConnects.scala rename to src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnects.scala index 132768f..fd6a34a 100644 --- a/src/main/scala/org/spot/netflow/FlowSuspiciousConnects.scala +++ b/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnects.scala @@ -1,11 +1,11 @@ -package org.spot.netflow +package org.apache.spot.netflow import org.apache.log4j.Logger import org.apache.spark.SparkContext import 
org.apache.spark.sql.SQLContext -import org.spot.SpotLDACWrapper -import org.spot.SpotLDACWrapper.SpotLDACOutput -import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.apache.spot.SpotLDACWrapper +import org.apache.spot.SpotLDACWrapper.SpotLDACOutput +import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig object FlowSuspiciousConnects { diff --git a/src/main/scala/org/spot/netflow/FlowWordCreation.scala b/src/main/scala/org/apache/spot/netflow/FlowWordCreation.scala similarity index 97% rename from src/main/scala/org/spot/netflow/FlowWordCreation.scala rename to src/main/scala/org/apache/spot/netflow/FlowWordCreation.scala index b37a817..122127f 100644 --- a/src/main/scala/org/spot/netflow/FlowWordCreation.scala +++ b/src/main/scala/org/apache/spot/netflow/FlowWordCreation.scala @@ -1,4 +1,4 @@ -package org.spot.netflow +package org.apache.spot.netflow import org.apache.log4j.Logger import org.apache.spark.SparkContext @@ -6,8 +6,8 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.spot.netflow.{FlowSchema => Schema} -import org.spot.utilities.Quantiles +import org.apache.spot.netflow.{FlowSchema => Schema} +import org.apache.spot.utilities.Quantiles object FlowWordCreation { diff --git a/src/main/scala/org/spot/proxy/ProxyFeedback.scala b/src/main/scala/org/apache/spot/proxy/ProxyFeedback.scala similarity index 97% rename from src/main/scala/org/spot/proxy/ProxyFeedback.scala rename to src/main/scala/org/apache/spot/proxy/ProxyFeedback.scala index 0f2eede..4d24187 100644 --- a/src/main/scala/org/spot/proxy/ProxyFeedback.scala +++ b/src/main/scala/org/apache/spot/proxy/ProxyFeedback.scala @@ -1,4 +1,4 @@ -package org.spot.proxy +package org.apache.spot.proxy import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD @@ -6,7 +6,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.types.{StructType, StructField, StringType} import scala.io.Source -import org.spot.proxy.ProxySchema._ +import org.apache.spot.proxy.ProxySchema._ object ProxyFeedback { diff --git a/src/main/scala/org/spot/proxy/ProxySchema.scala b/src/main/scala/org/apache/spot/proxy/ProxySchema.scala similarity index 97% rename from src/main/scala/org/spot/proxy/ProxySchema.scala rename to src/main/scala/org/apache/spot/proxy/ProxySchema.scala index f7a6716..28d1dac 100644 --- a/src/main/scala/org/spot/proxy/ProxySchema.scala +++ b/src/main/scala/org/apache/spot/proxy/ProxySchema.scala @@ -1,4 +1,4 @@ -package org.spot.proxy +package org.apache.spot.proxy /** * Data frame column names used in the proxy suspicious connects analysis. 
diff --git a/src/main/scala/org/spot/proxy/ProxySuspiciousConnectsAnalysis.scala b/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala similarity index 90% rename from src/main/scala/org/spot/proxy/ProxySuspiciousConnectsAnalysis.scala rename to src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala index ebc0032..5212434 100644 --- a/src/main/scala/org/spot/proxy/ProxySuspiciousConnectsAnalysis.scala +++ b/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala @@ -1,11 +1,11 @@ -package org.spot.proxy +package org.apache.spot.proxy import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext -import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.spot.proxy.ProxySchema._ -import org.spot.utilities.DataFrameUtils +import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.apache.spot.proxy.ProxySchema._ +import org.apache.spot.utilities.DataFrameUtils /** * Run suspicious connections analysis on proxy data. diff --git a/src/main/scala/org/spot/proxy/ProxySuspiciousConnectsModel.scala b/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala similarity index 95% rename from src/main/scala/org/spot/proxy/ProxySuspiciousConnectsModel.scala rename to src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala index 5be09de..7ce522d 100644 --- a/src/main/scala/org/spot/proxy/ProxySuspiciousConnectsModel.scala +++ b/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala @@ -1,4 +1,4 @@ -package org.spot.proxy +package org.apache.spot.proxy import org.apache.log4j.Logger import org.apache.spark.SparkContext @@ -6,11 +6,11 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.spot.SpotLDACWrapper.{SpotLDACInput, SpotLDACOutput} -import org.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig -import org.spot.proxy.ProxySchema._ -import org.spot.utilities._ -import org.spot.{SpotLDACWrapper, SuspiciousConnectsScoreFunction} +import org.apache.spot.SpotLDACWrapper.{SpotLDACInput, SpotLDACOutput} +import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig +import org.apache.spot.proxy.ProxySchema._ +import org.apache.spot.utilities._ +import org.apache.spot.{SpotLDACWrapper, SuspiciousConnectsScoreFunction} /** * Encapsulation of a proxy suspicious connections model. 
diff --git a/src/main/scala/org/spot/proxy/ProxyWordCreation.scala b/src/main/scala/org/apache/spot/proxy/ProxyWordCreation.scala similarity index 94% rename from src/main/scala/org/spot/proxy/ProxyWordCreation.scala rename to src/main/scala/org/apache/spot/proxy/ProxyWordCreation.scala index 086da8b..8407390 100644 --- a/src/main/scala/org/spot/proxy/ProxyWordCreation.scala +++ b/src/main/scala/org/apache/spot/proxy/ProxyWordCreation.scala @@ -1,8 +1,8 @@ -package org.spot.proxy +package org.apache.spot.proxy import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions._ -import org.spot.utilities.{Entropy, Quantiles, DomainProcessor, TimeUtilities} +import org.apache.spot.utilities.{Entropy, Quantiles, DomainProcessor, TimeUtilities} object ProxyWordCreation { diff --git a/src/main/scala/org/spot/utilities/CountryCodes.scala b/src/main/scala/org/apache/spot/utilities/CountryCodes.scala similarity index 97% rename from src/main/scala/org/spot/utilities/CountryCodes.scala rename to src/main/scala/org/apache/spot/utilities/CountryCodes.scala index 349c880..b57b031 100644 --- a/src/main/scala/org/spot/utilities/CountryCodes.scala +++ b/src/main/scala/org/apache/spot/utilities/CountryCodes.scala @@ -1,4 +1,4 @@ -package org.spot.utilities +package org.apache.spot.utilities object CountryCodes { diff --git a/src/main/scala/org/spot/utilities/DataFrameUtils.scala b/src/main/scala/org/apache/spot/utilities/DataFrameUtils.scala similarity index 96% rename from src/main/scala/org/spot/utilities/DataFrameUtils.scala rename to src/main/scala/org/apache/spot/utilities/DataFrameUtils.scala index 463f3f0..d5af6ee 100644 --- a/src/main/scala/org/spot/utilities/DataFrameUtils.scala +++ b/src/main/scala/org/apache/spot/utilities/DataFrameUtils.scala @@ -1,4 +1,4 @@ -package org.spot.utilities +package org.apache.spot.utilities import org.apache.spark.sql.{DataFrame, Row} diff --git a/src/main/scala/org/spot/utilities/DomainProcessor.scala b/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala similarity index 99% rename from src/main/scala/org/spot/utilities/DomainProcessor.scala rename to src/main/scala/org/apache/spot/utilities/DomainProcessor.scala index 95c0e23..28595e4 100644 --- a/src/main/scala/org/spot/utilities/DomainProcessor.scala +++ b/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala @@ -1,4 +1,4 @@ -package org.spot.utilities +package org.apache.spot.utilities import org.apache.spark.broadcast.Broadcast diff --git a/src/main/scala/org/spot/utilities/Entropy.scala b/src/main/scala/org/apache/spot/utilities/Entropy.scala similarity index 93% rename from src/main/scala/org/spot/utilities/Entropy.scala rename to src/main/scala/org/apache/spot/utilities/Entropy.scala index c9a2b87..5e2eb42 100644 --- a/src/main/scala/org/spot/utilities/Entropy.scala +++ b/src/main/scala/org/apache/spot/utilities/Entropy.scala @@ -1,4 +1,4 @@ -package org.spot.utilities +package org.apache.spot.utilities import scala.math._ diff --git a/src/main/scala/org/spot/utilities/Quantiles.scala b/src/main/scala/org/apache/spot/utilities/Quantiles.scala similarity index 99% rename from src/main/scala/org/spot/utilities/Quantiles.scala rename to src/main/scala/org/apache/spot/utilities/Quantiles.scala index e1603e1..09d2d3c 100644 --- a/src/main/scala/org/spot/utilities/Quantiles.scala +++ b/src/main/scala/org/apache/spot/utilities/Quantiles.scala @@ -1,4 +1,4 @@ -package org.spot.utilities +package org.apache.spot.utilities import org.apache.spark.rdd.RDD diff --git 
a/src/main/scala/org/spot/utilities/TimeUtilities.scala b/src/main/scala/org/apache/spot/utilities/TimeUtilities.scala similarity index 89% rename from src/main/scala/org/spot/utilities/TimeUtilities.scala rename to src/main/scala/org/apache/spot/utilities/TimeUtilities.scala index bdf94a5..c8e1030 100644 --- a/src/main/scala/org/spot/utilities/TimeUtilities.scala +++ b/src/main/scala/org/apache/spot/utilities/TimeUtilities.scala @@ -1,4 +1,4 @@ -package org.spot.utilities +package org.apache.spot.utilities object TimeUtilities { diff --git a/src/main/scala/org/spot/utilities/TopDomains.scala b/src/main/scala/org/apache/spot/utilities/TopDomains.scala similarity index 88% rename from src/main/scala/org/spot/utilities/TopDomains.scala rename to src/main/scala/org/apache/spot/utilities/TopDomains.scala index f6eea88..9329f14 100644 --- a/src/main/scala/org/spot/utilities/TopDomains.scala +++ b/src/main/scala/org/apache/spot/utilities/TopDomains.scala @@ -1,4 +1,4 @@ -package org.spot.utilities +package org.apache.spot.utilities import scala.io.Source diff --git a/src/test/scala/org/spot/DNSWordCreationTest.scala b/src/test/scala/org/apache/spot/DNSWordCreationTest.scala similarity index 59% rename from src/test/scala/org/spot/DNSWordCreationTest.scala rename to src/test/scala/org/apache/spot/DNSWordCreationTest.scala index 064863a..1b02333 100644 --- a/src/test/scala/org/spot/DNSWordCreationTest.scala +++ b/src/test/scala/org/apache/spot/DNSWordCreationTest.scala @@ -1,11 +1,11 @@ -package org.spot +package org.apache.spot import javax.swing.text.Utilities -import org.spot.dns.{DNSSuspiciousConnectsAnalysis, DNSWordCreation} -import org.spot.testutils.TestingSparkContextFlatSpec -import org.spot.utilities.{CountryCodes, Entropy, TopDomains} +import org.apache.spot.dns.{DNSSuspiciousConnectsAnalysis, DNSWordCreation} +import org.apache.spot.testutils.TestingSparkContextFlatSpec +import org.apache.spot.utilities.{CountryCodes, Entropy, TopDomains} import org.scalatest.Matchers class DNSWordCreationTest extends TestingSparkContextFlatSpec with Matchers { diff --git a/src/test/scala/org/spot/FlowWordCreationTest.scala b/src/test/scala/org/apache/spot/FlowWordCreationTest.scala similarity index 99% rename from src/test/scala/org/spot/FlowWordCreationTest.scala rename to src/test/scala/org/apache/spot/FlowWordCreationTest.scala index a8081e9..7e199b2 100644 --- a/src/test/scala/org/spot/FlowWordCreationTest.scala +++ b/src/test/scala/org/apache/spot/FlowWordCreationTest.scala @@ -1,7 +1,7 @@ -package org.spot +package org.apache.spot -import org.spot.netflow.FlowWordCreation +import org.apache.spot.netflow.FlowWordCreation import org.scalatest.{FlatSpec, Matchers} class FlowWordCreationTest extends FlatSpec with Matchers { diff --git a/src/test/scala/org/spot/QuantilesTest.scala b/src/test/scala/org/apache/spot/QuantilesTest.scala similarity index 98% rename from src/test/scala/org/spot/QuantilesTest.scala rename to src/test/scala/org/apache/spot/QuantilesTest.scala index 562f864..43fd47b 100644 --- a/src/test/scala/org/spot/QuantilesTest.scala +++ b/src/test/scala/org/apache/spot/QuantilesTest.scala @@ -1,7 +1,7 @@ -package org.spot +package org.apache.spot import org.apache.spark.rdd.RDD -import org.spot.utilities.Quantiles +import org.apache.spot.utilities.Quantiles import org.scalatest.Matchers import testutils.TestingSparkContextFlatSpec diff --git a/src/test/scala/org/spot/SpotLDACWrapperTest.scala b/src/test/scala/org/apache/spot/SpotLDACWrapperTest.scala similarity index 98% rename from 
src/test/scala/org/spot/SpotLDACWrapperTest.scala rename to src/test/scala/org/apache/spot/SpotLDACWrapperTest.scala index fb9bd62..9a31b4b 100644 --- a/src/test/scala/org/spot/SpotLDACWrapperTest.scala +++ b/src/test/scala/org/apache/spot/SpotLDACWrapperTest.scala @@ -1,7 +1,7 @@ -package org.spot +package org.apache.spot -import org.spot.SpotLDACWrapper.SpotLDACInput -import org.spot.testutils.TestingSparkContextFlatSpec +import org.apache.spot.SpotLDACWrapper.SpotLDACInput +import org.apache.spot.testutils.TestingSparkContextFlatSpec import org.scalatest.Matchers diff --git a/src/test/scala/org/spot/testutils/TestingSparkContext.scala b/src/test/scala/org/apache/spot/testutils/TestingSparkContext.scala similarity index 98% rename from src/test/scala/org/spot/testutils/TestingSparkContext.scala rename to src/test/scala/org/apache/spot/testutils/TestingSparkContext.scala index 1c50c91..7162ac6 100644 --- a/src/test/scala/org/spot/testutils/TestingSparkContext.scala +++ b/src/test/scala/org/apache/spot/testutils/TestingSparkContext.scala @@ -1,4 +1,4 @@ -package org.spot.testutils +package org.apache.spot.testutils import java.util.Date diff --git a/src/test/scala/org/spot/testutils/TestingSparkContextFlatSpec.scala b/src/test/scala/org/apache/spot/testutils/TestingSparkContextFlatSpec.scala similarity index 95% rename from src/test/scala/org/spot/testutils/TestingSparkContextFlatSpec.scala rename to src/test/scala/org/apache/spot/testutils/TestingSparkContextFlatSpec.scala index fa0dd94..00f8136 100644 --- a/src/test/scala/org/spot/testutils/TestingSparkContextFlatSpec.scala +++ b/src/test/scala/org/apache/spot/testutils/TestingSparkContextFlatSpec.scala @@ -2,7 +2,7 @@ * THIS CODE WAS COPIED DIRECTLY FROM THE OPEN SOURCE PROJECT TAP (Trusted Analytics Platform) * which has an Apache V2.0 */ -package org.spot.testutils +package org.apache.spot.testutils import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext diff --git a/src/test/scala/org/spot/testutils/TestingSparkContextFunSuite.scala b/src/test/scala/org/apache/spot/testutils/TestingSparkContextFunSuite.scala similarity index 95% rename from src/test/scala/org/spot/testutils/TestingSparkContextFunSuite.scala rename to src/test/scala/org/apache/spot/testutils/TestingSparkContextFunSuite.scala index 49b3f05..7408a83 100644 --- a/src/test/scala/org/spot/testutils/TestingSparkContextFunSuite.scala +++ b/src/test/scala/org/apache/spot/testutils/TestingSparkContextFunSuite.scala @@ -3,7 +3,7 @@ * which has an Apache V2.0 */ -package org.spot.testutils +package org.apache.spot.testutils import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext diff --git a/src/test/scala/org/spot/testutils/TestingSparkContextWordSpec.scala b/src/test/scala/org/apache/spot/testutils/TestingSparkContextWordSpec.scala similarity index 95% rename from src/test/scala/org/spot/testutils/TestingSparkContextWordSpec.scala rename to src/test/scala/org/apache/spot/testutils/TestingSparkContextWordSpec.scala index 5169db3..02f6456 100644 --- a/src/test/scala/org/spot/testutils/TestingSparkContextWordSpec.scala +++ b/src/test/scala/org/apache/spot/testutils/TestingSparkContextWordSpec.scala @@ -3,7 +3,7 @@ * which has an Apache V2.0 */ -package org.spot.testutils +package org.apache.spot.testutils import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext diff --git a/src/test/scala/org/spot/utilities/DomainProcessorTest.scala b/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala similarity index 96% 
rename from src/test/scala/org/spot/utilities/DomainProcessorTest.scala rename to src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala index 6de4e3e..c1d93f0 100644 --- a/src/test/scala/org/spot/utilities/DomainProcessorTest.scala +++ b/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala @@ -1,8 +1,8 @@ -package org.spot.utilities +package org.apache.spot.utilities -import org.spot.testutils.TestingSparkContextFlatSpec +import org.apache.spot.testutils.TestingSparkContextFlatSpec import org.scalatest.{FunSuite, Matchers} -import org.spot.utilities.DomainProcessor._ +import org.apache.spot.utilities.DomainProcessor._ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers { From 1692d2101cb1fbb424e34ae0e8cd395a1de0d502 Mon Sep 17 00:00:00 2001 From: nlsegerl Date: Thu, 29 Sep 2016 14:17:38 -0700 Subject: [PATCH 6/6] spot_naming on second thought, upload.sh won't be added to git at this time --- upload.sh | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100755 upload.sh diff --git a/upload.sh b/upload.sh deleted file mode 100755 index 72d29fb..0000000 --- a/upload.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -USER=$1 -CLUSTER=$2 -DIRNAME=$3 - -# any optional arguments you wish to pass directly to rsync go in position 4 - -# exclude the hidden files, target, lib, project, src... except the assembled jar -rsync $4 -v -a --include='target' --include='target/scala-2.10' --include='target/scala-2.10/*.jar' \ - --include='spot-lda-c' --include='spot-lda-c/*' --include='*.py' --include='*.sh' --include 'top-1m.csv' \ - --exclude='*' . $USER@$CLUSTER:~/$DIRNAME
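Editor's sketch (not part of the patch series): after PATCH 5/6 the Spark entry point is org.apache.spot.SuspiciousConnects, and the snippet below shows how a submission using that name might look. It mirrors the spark-submit hunk patched in ml_ops.sh above, but the jar path, the 8g driver-memory value, and the pass-through arguments are illustrative assumptions rather than values defined by these patches.

#!/bin/bash
# Sketch only: submit the renamed suspicious connects job to YARN.
# SPOT_ML_JAR is an assumed location for the assembled jar; adjust it to your build output.
SPOT_ML_JAR=target/scala-2.10/spot-ml-assembly-1.1.jar

spark-submit --class "org.apache.spot.SuspiciousConnects" \
    --master yarn-client \
    --driver-memory 8g \
    ${SPOT_ML_JAR} \
    "$@"    # analysis-specific flags (data paths, analysis type, etc.) are passed through unchanged

In practice ml_ops.sh remains the launcher; this sketch only isolates the effect of the class rename on the spark-submit command.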