diff --git a/.gitignore b/.gitignore
index ad354819..7dc60f54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@
# Netbeans files #
nb-configuration.xml
+nbaction.xml
# IntelliJ IDEA files #
.idea
@@ -43,3 +44,4 @@ dependency-reduced-pom.xml
/affiliation-organization-matching/affiliation-organization-matching-workflow/src/main/oozie/workflow.xml
/deduplication-organization/deduplication-organization-workflow/src/main/oozie/workflow.xml
+/deduplication-document-spark/deduplication-document-spark-impl/nbproject/
diff --git a/deduplication-document-spark/deduplication-document-spark-impl/nbactions.xml b/deduplication-document-spark/deduplication-document-spark-impl/nbactions.xml
new file mode 100644
index 00000000..b9137ba9
--- /dev/null
+++ b/deduplication-document-spark/deduplication-document-spark-impl/nbactions.xml
@@ -0,0 +1,73 @@
+
+
+
+ run
+
+ jar
+
+
+ process-classes
+ org.codehaus.mojo:exec-maven-plugin:1.2.1:exec
+
+
+ -classpath %classpath pl.edu.icm.coansys.document.deduplication.DeduplicateDocuments
+ java
+
+
+
+ debug
+
+ jar
+
+
+ process-classes
+ org.codehaus.mojo:exec-maven-plugin:1.2.1:exec
+
+
+ -Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -classpath %classpath pl.edu.icm.coansys.document.deduplication.DeduplicateDocuments
+ java
+ true
+
+
+
+ profile
+
+ jar
+
+
+ process-classes
+ org.codehaus.mojo:exec-maven-plugin:1.2.1:exec
+
+
+ -classpath %classpath pl.edu.icm.coansys.document.deduplication.DeduplicateDocuments
+ java
+
+
+
+ CUSTOM-scala:run
+ scala:run
+
+ scala:run
+
+
+
+ CUSTOM-RunSmall
+ RunSmall
+
+ scala:run
+
+
+ test|test2
+
+
+
+
+ CUSTOM-clean,build,upload
+ clean,build,upload
+
+ clean
+ install
+ wagon:upload-single
+
+
+
diff --git a/deduplication-document-spark/deduplication-document-spark-impl/pom.xml b/deduplication-document-spark/deduplication-document-spark-impl/pom.xml
new file mode 100644
index 00000000..4abc9031
--- /dev/null
+++ b/deduplication-document-spark/deduplication-document-spark-impl/pom.xml
@@ -0,0 +1,198 @@
+
+
+ 4.0.0
+
+ pl.edu.icm.coansys
+ deduplication-document-spark
+ 1.11-SNAPSHOT
+
+
+ deduplication-document-spark-impl
+ jar
+ Deduplication - Document - SparkVersion - Implementation
+
+
+ GNU AFFERO GENERAL PUBLIC LICENSE, Version 3 (AGPL-3.0)
+ http://opensource.org/licenses/AGPL-3.0
+
+
+
+
+ ssh-cypisek
+ scpexe://cypisek/jobs
+
+
+
+
+ src/main/scala
+ src/test/scala
+
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+
+
+ compile
+ testCompile
+
+
+
+
+ -dependencyfile
+ ${project.build.directory}/.scala_dependencies
+
+
+
+
+
+
+
+ base
+
+ pl.edu.icm.coansys.document.deduplication.DeduplicateDocuments
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 2.3.2
+
+ 1.8
+ 1.8
+
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+ 2.3
+
+
+ package
+
+ shade
+
+
+
+
+
+
+ *:*
+
+ META-INF/*.SF
+ META-INF/*.DSA
+ META-INF/*.RSA
+
+
+
+
+
+ junit:junit
+ log4j:log4j:jar:
+ org.scala-lang:scala-library:jar:
+ org.apache.spark:spark-core_2.10
+ org.apache.spark:spark-sql_2.10
+ org.apache.spark:spark-streaming_2.10
+
+
+ ${project.artifactId}-${project.version}
+
+
+
+ org.codehaus.mojo
+ wagon-maven-plugin
+ 1.0-beta-3
+
+ ${project.build.directory}/${project.build.finalName}.jar
+ scp://cypisek-gw.ocean.icm.edu.pl/home/axnow/jobs/
+ dedupdocs.jar
+
+
+
+ org.scalatest
+ scalatest-maven-plugin
+ 1.0
+
+ ${project.build.directory}/surefire-reports
+ .
+ WDF TestSuite.txt
+
+
+
+ test
+
+ test
+
+
+
+
+
+
+
+
+ org.apache.maven.wagon
+ wagon-ssh
+ 2.8
+
+
+
+
+
+
+
+ ${project.groupId}
+ models
+ ${project.version}
+
+
+ ${project.groupId}
+ deduplication-document-impl
+ ${project.version}
+
+
+ org.apache.spark
+ spark-core_2.10
+
+
+ org.apache.spark
+ spark-graphx_2.10
+
+
+ javax.servlet
+ javax.servlet-api
+ 3.1.0
+ runtime
+
+
+ com.google.guava
+ guava
+ 15.0
+
+
+ com.github.scopt
+ scopt_2.10
+ 3.6.0
+
+
+ org.scalatest
+ scalatest_2.10
+ 3.0.1
+ test
+
+
+ com.holdenkarau
+ spark-testing-base_2.11
+ 1.6.0_0.7.2
+ test
+
+
+
diff --git a/deduplication-document-spark/deduplication-document-spark-impl/src/main/scala/pl/edu/icm/coansys/document/deduplication/CartesianTaskSplit.scala b/deduplication-document-spark/deduplication-document-spark-impl/src/main/scala/pl/edu/icm/coansys/document/deduplication/CartesianTaskSplit.scala
new file mode 100644
index 00000000..5404d932
--- /dev/null
+++ b/deduplication-document-spark/deduplication-document-spark-impl/src/main/scala/pl/edu/icm/coansys/document/deduplication/CartesianTaskSplit.scala
@@ -0,0 +1,99 @@
+/*
+ * This file is part of CoAnSys project.
+ * Copyright (c) 2012-2017 ICM-UW
+ *
+ * CoAnSys is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+
+ * CoAnSys is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
+ */
+package pl.edu.icm.coansys.document.deduplication
+
+import pl.edu.icm.coansys.models.DocumentProtos.DocumentWrapper
+
+class CartesianTaskSplit(
+ val clusterId: String,
+ val taskId: String,
+ val rows: Seq[DocumentWrapper],
+ val columns: Seq[DocumentWrapper]
+) {
+ /**
+ * Generate a list of clusters of documents for which the predicate holds, i.e.
+ * the function passed returns true. The predicate is assumed to be
+ * symmetric, so it is executed only once per pair. Note that since all
+ * tiles are expected to appear within the task, and the comparison
+ * operator may be expensive, only pairs where the row key is less than
+ * the column key are taken into account.
+ *
+ * @param equalityTest predicate which defines whether two elements
+ * are considered matching (typically equal)
+ * @return list of lists of keys of equal documents (documents where
+ * equalityTest returned true)
+ */
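+ // Hedged usage sketch (not part of the original code; `split` stands for a CartesianTaskSplit
+ // instance and the key-prefix test is a hypothetical predicate - any symmetric test works):
+ //   val clusters = split.processPairs((a, b) =>
+ //     a.getDocumentMetadata.getKey.take(8) == b.getDocumentMetadata.getKey.take(8))
+ //   // clusters: Seq(Seq(keyA, keyB, ...), ...) - keys of documents judged equal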
+ def processPairs(equalityTest: (DocumentWrapper, DocumentWrapper) => Boolean): Seq[Seq[String]] = {
+
+ val clusters: Seq[Seq[String]] = rows.map(row => {
+ val rkey = row.getDocumentMetadata.getKey
+ val equalColumnKeys = columns.filter(rkey < _.getDocumentMetadata.getKey)
+ .filter(equalityTest(row, _))
+ .map(_.getDocumentMetadata.getKey)
+ equalColumnKeys :+ rkey
+ }).filter(_.size > 1)
+ CartesianTaskSplit.coalesceClusters(clusters)
+ }
+
+}
+
+object CartesianTaskSplit {
+ val log = org.slf4j.LoggerFactory.getLogger(getClass().getName())
+ /**
+ * Combine clusters that have a non-empty intersection, so the result
+ * contains only disjoint clusters.
+ *
+ * @param clusters lists to combine
+ * @return list of disjoint clusters obtained by merging the input clusters
+ */
+ def coalesceClusters(clusters: Seq[Seq[String]]): Seq[Seq[String]] = {
+ var sets = clusters.map(_.toSet[String])
+ var res = List.empty[Set[String]]
+ while (!sets.isEmpty) {
+ var current = sets.head
+ sets = sets.tail
+ var ps: (Seq[Set[String]], Seq[Set[String]]) = null
+ do {
+ ps = sets.partition(_.exists(current.contains(_)))
+ current = current ++ ps._1.flatten // merge overlapping clusters into the current one
+ sets = ps._2
+ } while (!ps._1.isEmpty)
+ res = res :+ current
+ }
+ res.map(_.toSeq)
+ }
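+ // Worked example (element order inside each returned cluster is not guaranteed):
+ //   coalesceClusters(Seq(Seq("a", "b"), Seq("b", "c"), Seq("d")))
+ //   // -> Seq(Seq("a", "b", "c"), Seq("d"))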
+
+ /** Split one large cluster into parallel tasks of the given size.
+ */
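+ // e.g. 100 documents with tileSize 25 give ntiles = 4 groups, hence 4 x 4 = 16 tile tasks.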
+ def parallelizeCluster(clusterId: String, documents: Iterable[DocumentWrapper], tileSize: Int): Seq[CartesianTaskSplit] = {
+ log.info(f"Document count: ${documents.size}, tile size $tileSize")
+ val ntiles = documents.size/tileSize + (if(documents.size % tileSize>0) 1 else 0)
+ println(f"ntiles: $ntiles")
+
+ val sdoc = documents.toVector.sorted(Ordering.by[DocumentWrapper, String](_.getDocumentMetadata.getKey))
+ val groupedDocs = sdoc.zipWithIndex.map(docidx => (docidx._2%ntiles, docidx._1)).groupBy[Int](_._1).mapValues(_.map(_._2).toVector).toVector
+ val res = groupedDocs.flatMap(kv =>
+ groupedDocs.map(kvin => new CartesianTaskSplit(
+ clusterId, f"${clusterId}_${kv._1}:${kv._2}",kv._2, kvin._2
+ )
+ )
+ )
+ res
+ }
+}
diff --git a/deduplication-document-spark/deduplication-document-spark-impl/src/main/scala/pl/edu/icm/coansys/document/deduplication/DeduplicateDocuments.scala b/deduplication-document-spark/deduplication-document-spark-impl/src/main/scala/pl/edu/icm/coansys/document/deduplication/DeduplicateDocuments.scala
new file mode 100644
index 00000000..0e4c59e7
--- /dev/null
+++ b/deduplication-document-spark/deduplication-document-spark-impl/src/main/scala/pl/edu/icm/coansys/document/deduplication/DeduplicateDocuments.scala
@@ -0,0 +1,420 @@
+/*
+ * This file is part of CoAnSys project.
+ * Copyright (c) 2012-2017 ICM-UW
+ *
+ * CoAnSys is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+
+ * CoAnSys is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package pl.edu.icm.coansys.document.deduplication
+import scala.collection.JavaConversions._
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import java.util.function.BiPredicate
+import org.apache.hadoop.io.BytesWritable
+import org.apache.spark.SparkConf
+import pl.edu.icm.coansys.deduplication.document.voter.AuthorsVoter
+import pl.edu.icm.coansys.deduplication.document.voter.DoiVoter
+import pl.edu.icm.coansys.deduplication.document.voter.IssueVolumeVoter
+import pl.edu.icm.coansys.deduplication.document.voter.JournalVoter
+import pl.edu.icm.coansys.deduplication.document.voter.PagesVoter
+import pl.edu.icm.coansys.deduplication.document.voter.SimilarityVoter
+import pl.edu.icm.coansys.deduplication.document.voter.TitleVoter
+import pl.edu.icm.coansys.deduplication.document.voter.YearVoter
+import pl.edu.icm.coansys.document.deduplication.merge.AdvancedDuplicatesMerger
+import pl.edu.icm.coansys.document.deduplication.merge.DuplicatesMerger
+import pl.edu.icm.coansys.models.DocumentProtos
+import pl.edu.icm.coansys.models.DocumentProtos._
+import org.apache.spark.rdd.RDD
+import pl.edu.icm.coansys.deduplication.document.comparator.VotesProductComparator
+import pl.edu.icm.coansys.deduplication.document.comparator.WorkComparator
+import scala.collection.mutable.ListBuffer
+import pl.edu.icm.coansys.document.deduplication._
+import scala.collection.JavaConverters._
+
+/** Main application for the deduplication of the documents.
+ *
+ */
+object DeduplicateDocuments {
+ val log = org.slf4j.LoggerFactory.getLogger(getClass().getName())
+
+ implicit def toJavaBiPredicate[A, B](predicate: (A, B) => Boolean) =
+ new BiPredicate[A, B] {
+ def test(a: A, b: B) = predicate(a, b)
+ }
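+ // Lets plain Scala functions be passed wherever a java.util.function.BiPredicate is expected.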
+
+ def isValidDocument(doc: DocumentWrapper): Boolean = {
+ doc.hasDocumentMetadata() && {
+ val md = doc.getDocumentMetadata
+ md.hasBasicMetadata && {
+ val bmd = md.getBasicMetadata
+ bmd.getTitleCount() > 0 || bmd.getAuthorCount > 0 || bmd.hasDoi || bmd.hasJournal
+ }
+ }
+ }
+
+
+ def calculateKeys(doc: DocumentMetadata, initialClusteringKeySize: Int, maximumClusteringKeySize: Int): Seq[String] = {
+ val keySizes = initialClusteringKeySize to maximumClusteringKeySize
+ var res = MultiLengthTitleKeyGenerator.generateKeys(doc)(keySizes)
+ if (res.head.isEmpty) {
+ res = Array.fill[String](keySizes.length)(doc.getKey)
+ }
+ res
+ }
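+ // Note: when the generated title-based keys are empty (e.g. the document has no usable title),
+ // the document's own key is substituted for every key length, so it effectively forms a
+ // singleton cluster instead of being dropped.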
+
+ /**
+ * Group items into large clusters, within which the detailed analysis will be
+ * performed.
+ *
+ * Items are grouped by keys generated from the normalised titles.
+ * If a cluster is too big, longer keys are used so that smaller clusters are
+ * generated. The threshold is maximumClusterSize.
+ *
+ */
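+ // Illustrative sketch (hypothetical title): with key sizes 5..15, a document titled
+ // "Deduplication of scholarly records" is first clustered under its 5-character title key;
+ // when that cluster grows beyond maximumClusterSize, the 6-character key is preferred, and
+ // so on up to maximumClusteringKeySize (the keys come from MultiLengthTitleKeyGenerator).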
+ def prepareInitialClustering(inputDocs: RDD[(String, DocumentWrapper)], initialClusteringKeySize: Int,
+ maximumClusteringKeySize: Int, maximumClusterSize: Int): RDD[(String, Iterable[DocumentWrapper])] = {
+ log.info("Initializing cluster preparation.")
+ val keySizes = initialClusteringKeySize to maximumClusteringKeySize
+ log.info("Will use key sizes: " + keySizes.mkString(", "))
+
+ val idClusterKeys = inputDocs.mapValues(doc => calculateKeys(
+ doc.getDocumentMetadata(), initialClusteringKeySize, maximumClusteringKeySize)) //we lose the documents here, only ids are preserved
+ val clusterDoc = idClusterKeys.flatMap(kv => kv._2.map(idcluster => (idcluster, kv._1))) // (clusterId => docId)
+ val clusterSizes = idClusterKeys.flatMap(x => (x._2.map(y => (y, 1)))).reduceByKey(_ + _) //(clusterId => clusterSize)
+
+ //build rdd (docId, (clusterId, clusterSize) )
+ val docClustersWithSizes = clusterDoc.join(clusterSizes).map(p => (p._2._1, (p._1, p._2._2)))
+ //build rdd - (docId, clusterId)
+ val selectedClusters = docClustersWithSizes.reduceByKey((x, y) => {
+ if (x._2 <= maximumClusterSize) {
+ if (y._2 <= maximumClusterSize) {
+ if (x._1.length <= y._1.length) { x } else { y }
+ } else {
+ x
+ }
+ } else {
+ if (y._2 <= maximumClusterSize) {
+ y
+ } else {
+ if (x._1.length > y._1.length) { x } else { y }
+ }
+ }
+ }).mapValues(_._1)
+ inputDocs.join(selectedClusters).map(p => (p._2._2, p._2._1)).groupByKey
+ }
+
+ def buildDocumentsMerger(): DuplicatesMerger = {
+ val res = new AdvancedDuplicatesMerger
+ res.setup("")
+ res
+ }
+
+ /**
+ * Merge the documents using appropriate document merger.
+ */
+ def mergeDocuments(docs: List[DocumentWrapper]): DocumentWrapper = {
+ val merger = buildDocumentsMerger()
+ val merged = merger.merge(docs);
+ merged
+ }
+
+ /**
+ * Defines the comparator with the weights resulting from experiments.
+ *
+ * This is a reimplementation of the original Spring XML bean definition, which
+ * was an unnecessary complication at this point.
+ */
+ def buildWorkComparator(): WorkComparator = {
+ val result = new VotesProductComparator;
+ result.setMinVotersWeightRequired(1.5f)
+ result.setProbabilityTreshold(0.5f)
+ result.setTresholdIncreasingVotersRequired(0.7f)
+
+ val voters = new ListBuffer[SimilarityVoter]()
+ val dv = new DoiVoter()
+ dv.setWeight(1.0f)
+ voters += dv
+ val jv = new JournalVoter()
+ jv.setWeight(0.3f)
+ jv.setDisapproveLevel(0.5f)
+ jv.setApproveLevel(0.05f)
+ voters += jv
+
+ val wivv = new IssueVolumeVoter
+ wivv.setWeight(0.3f)
+ wivv.setAbstainIfAbsent(true)
+ wivv.setSubsetResult(0.8f)
+ wivv.setPartiallyMatchResult(0.52f)
+ voters += wivv
+
+ val wpv = new PagesVoter
+ wpv.setWeight(.3f)
+ wpv.setAbstainIfAbsent(true)
+ wpv.setAbsentResult(0.6f)
+ wpv.setSubsetResult(0.75f)
+ wpv.setPartiallyMatchResult(0.64f)
+ wpv.setRemoveRepeated(true)
+ voters += wpv
+
+ val wyv = new YearVoter
+ wyv.setWeight(.3f)
+ wyv.setAbstainIfAbsent(true)
+ wyv.setAbsentResult(.52f)
+ wyv.setSubsetResult(.9f)
+ wyv.setPartiallyMatchResult(.75f)
+ wyv.setRemoveRepeated(true)
+ voters += wyv
+
+ val wtv = new TitleVoter()
+ wtv.setWeight(0.8f)
+ wtv.setDisapproveLevel(0.11f)
+ wtv.setApproveLevel(0.001f)
+ wtv.setMaxNormalizedTitleLength(90)
+ voters += wtv
+
+ val wav = new AuthorsVoter
+ wav.setWeight(0.8f)
+ wav.setDisapproveLevel(0.2f)
+ wav.setApproveLevel(0.03f)
+ voters += wav
+
+ result.setSimilarityVoters(voters)
+ result;
+ }
+
+
+ case class Config(
+ inputFile: String = "",
+ outputFile: String = "",
+ dumpClusters: Boolean = false,
+ keySizeMin: Int = 5,
+ keySizeMax: Int = 15,
+ clusterSizeMax: Int = 500,
+ tileSize: Int = 25,
+ filterInvalidDocuments: Boolean = false,
+ removeDuplicateDocuments: Boolean = false
+ )
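+ // Hedged invocation sketch (jar name and paths are illustrative only):
+ //   spark-submit --class pl.edu.icm.coansys.document.deduplication.DeduplicateDocuments \
+ //     dedupdocs.jar -f -d --cluster-size-max 500 --tile-size 25 \
+ //     /data/input-documents.sf /data/deduplicated-output.sf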
+
+ /** Load the documents from the given sequence file and perform the optional
+ * cleanups.
+ *
+ */
+ def loadDocuments( sc: SparkContext, file: String,
+ filterInvalid: Boolean, removeDoubles: Boolean):RDD[(String, DocumentWrapper)] = {
+ val rawbytes = sc.sequenceFile[String, BytesWritable](file).mapValues(_.copyBytes)
+ println("Loaded raw bytes.")
+
+ val dirtyWrappers = rawbytes.mapValues(b => DocumentProtos.DocumentWrapper.parseFrom(b))
+
+ //filter out invalid documents:
+ val fixedWrappers = if (filterInvalid) {
+ val x = dirtyWrappers.filter(w => isValidDocument(w._2))
+ val afterSize = x.count;
+ val preSize = dirtyWrappers.count
+ log.info(f"Filtering invalid documents done, before filtering: $preSize and after filtering $afterSize documents left.")
+ x
+ } else {
+ dirtyWrappers
+ }
+
+ if (removeDoubles) {
+ fixedWrappers.reduceByKey((x, y) => y)
+ } else {
+ fixedWrappers
+ }
+ }
+
+ /** Debug method to print out the top clusters. */
+ def printTopClusters(finalClusters: RDD[(String, Seq[String])], count: Int): Unit = {
+ val finclSizes = finalClusters.mapValues(_.size).takeOrdered(count)(Ordering[Int].on(-_._2))
+ println(f"Top $count cluster sizes:")
+ finclSizes.foreach(println(_))
+ println("-----\n\n")
+
+ }
+
+
+
+
+ def main(args: Array[String]): Unit = {
+ val fixInvalidDocuments = true;
+ val removeDoubles = true;
+
+ println("Starting document deduplication")
+
+ val parser = new scopt.OptionParser[Config]("CoAnSys Deduplicate Documents") {
+ head("Deduplicate documents", "0.1")
+
+ opt[Unit]('f', "filter-invalid").action((x, c) =>
+ c.copy(filterInvalidDocuments = true)).text("filter invalid (empty) documents before run.")
+
+ opt[Unit]('d', "remove-doubles").action((x, c) =>
+ c.copy(removeDuplicateDocuments = true)).text("filter out duplicates sharing the same key before processing.")
+
+ opt[Int]("cluster-key-min").abbr("kmn").action((x, c) => c.copy(keySizeMin = x)).
+ validate(x =>
+ if (x >= 2) success
+ else failure("Value must be >=2")).
+ text("shortest valid key for cluster, defines pre-clustering. Recommended value more thab 4, minimum 2.")
+
+ opt[Int]("cluster-key-max").abbr("kmx").action((x, c) => c.copy(keySizeMax = x)).
+ validate(x =>
+ if (x >= 2 && x <= 20) success
+ else failure("Value must be >=2")).
+ text("longest valid key for cluster, during pre-clustering. Used to split large clusters. Recommended value more than min, minimum 2, max 20.")
+
+ opt[Int]("cluster-size-max").abbr("cs").action((x, c) => c.copy(clusterSizeMax = x)).
+ text("Largest acceptable cluster size during preclustering phase. If cluster exceeds algorithm attempts to use longer key if possible. Typically 400+")
+
+ opt[Int]("tile-size").abbr("ts").action((x, c) => c.copy(keySizeMax = x)).
+ validate(x =>
+ if (x >= 2) success
+ else failure("Value must be >=2")).
+ text("Size of the tile tasks used to split large clusters. Min 2, recommended approx 40")
+
+ arg[String]("").required.text("Input sequence file").action((f, c) => c.copy(inputFile = f))
+ arg[String]("