diff --git a/doc/examples/pairtools_phase_walkthrough.ipynb b/doc/examples/pairtools_phase_walkthrough.ipynb index c1116fa4..2d72c016 100644 --- a/doc/examples/pairtools_phase_walkthrough.ipynb +++ b/doc/examples/pairtools_phase_walkthrough.ipynb @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "9ec0743f-a299-43f0-b568-7e963ed95df8", "metadata": { "tags": [ @@ -112,16 +112,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2024-03-18 13:18:25-- https://ftp.ensembl.org/pub/release-68/fasta/mus_musculus/dna/Mus_musculus.GRCm38.68.dna_sm.toplevel.fa.gz\n", + "--2025-08-19 15:44:18-- https://ftp.ensembl.org/pub/release-68/fasta/mus_musculus/dna/Mus_musculus.GRCm38.68.dna_sm.toplevel.fa.gz\n", "Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.169\n", "Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.169|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 861993605 (822M) [application/x-gzip]\n", "Saving to: ‘Mus_musculus.GRCm38.68.dna_sm.toplevel.fa.gz’\n", "\n", - "Mus_musculus.GRCm38 100%[===================>] 822.06M 2.43MB/s in 5m 35s \n", + "Mus_musculus.GRCm38 100%[===================>] 822.06M 145MB/s in 6.4s \n", "\n", - "2024-03-18 13:24:01 (2.45 MB/s) - ‘Mus_musculus.GRCm38.68.dna_sm.toplevel.fa.gz’ saved [861993605/861993605]\n", + "2025-08-19 15:44:25 (128 MB/s) - ‘Mus_musculus.GRCm38.68.dna_sm.toplevel.fa.gz’ saved [861993605/861993605]\n", "\n" ] } @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "id": "79e6471d", "metadata": {}, "outputs": [], @@ -161,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "4a347a3b-2ee7-4824-a209-8377edddf640", "metadata": { "tags": [ @@ -173,14 +173,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2024-03-18 13:18:14-- https://ftp.ebi.ac.uk/pub/databases/mousegenomes/REL-1505-SNPs_Indels/strain_specific_vcfs/CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", + "--2025-08-19 15:47:05-- https://ftp.ebi.ac.uk/pub/databases/mousegenomes/REL-1505-SNPs_Indels/strain_specific_vcfs/CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", "Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.165\n", "Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.165|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 785849127 (749M) [application/x-gzip]\n", - "Saving to: ‘CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz.2’\n", + "Saving to: ‘CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz’\n", "\n", - " CAST_EiJ.mgp.v5.sn 0%[ ] 4.82M 2.42MB/s ^C\n" + "CAST_EiJ.mgp.v5.snp 100%[===================>] 749.44M 114MB/s in 6.5s \n", + "\n", + "2025-08-19 15:47:12 (115 MB/s) - ‘CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz’ saved [785849127/785849127]\n", + "\n" ] } ], @@ -198,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "84cebce3-29c6-42df-98bf-5388a51fb268", "metadata": {}, "outputs": [], @@ -216,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "id": "848c9fe5-a632-4139-ba56-60871d8d1eb4", "metadata": { "tags": [] @@ -270,6 +273,7 @@ "Warning: Sequence \"JH584301.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", "Warning: Sequence \"GL456233.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", "Warning: Sequence \"JH584299.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", + "Applied 21340620 variants\n", "Warning: Sequence \"JH584295.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", "Warning: Sequence \"JH584292.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", "Warning: Sequence \"GL456368.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", @@ -313,7 +317,8 @@ "Warning: Sequence \"GL456211.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", "Warning: Sequence \"JH584301.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", "Warning: Sequence \"GL456233.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", - "Warning: Sequence \"JH584299.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n" + "Warning: Sequence \"JH584299.1\" not in CAST_EiJ.mgp.v5.snps.dbSNP142.vcf.gz\n", + "Applied 22613814 variants\n" ] } ], @@ -331,8 +336,7 @@ "id": "af1f6027", "metadata": {}, "source": [ - "Note that some of these inserted variants may change the total number of nucleotides. This would introduce differences between the coordinate systems of two homologs and complicate downstream analyses. Thus, to make your analyses simpler.you may want insert only single-nuleotide variants and exclude \n", - "by using `--include` parameter of `bcftools consensus` (e.g. `--include '(STRLEN(REF)=1) & (STRLEN(ALT[0])=1)'`).\n", + "Note that some of these inserted variants may change the total number of nucleotides. This would introduce differences between the coordinate systems of two homologs and complicate downstream analyses. Thus, to make your analyses simpler you may want insert only single-nuleotide variants and exclude indels by using `--include` parameter of `bcftools consensus` (e.g. `--include '(STRLEN(REF)=1) & (STRLEN(ALT[0])=1)'`).\n", "This will make sure that the genomic coorditates correspond between the haplotypes. \n", "Correspondence of coordinates is not a requirement, but might be important for downstream analysis. " ] @@ -355,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, "id": "92ff8a4f-2115-4131-8c4a-cbd040dcdffb", "metadata": { "tags": [ @@ -367,7 +371,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[bwa_index] Pack FASTA... 62.34 sec\n", + "[bwa_index] Pack FASTA... 54.68 sec\n", "[bwa_index] Construct BWT for the packed sequence...\n", "[BWTIncCreate] textLength=10923487096, availableWord=780616804\n", "[BWTIncConstructFromPacked] 10 iterations done. 99999992 characters processed.\n", @@ -487,13 +491,13 @@ "[BWTIncConstructFromPacked] 1150 iterations done. 10903777896 characters processed.\n", "[BWTIncConstructFromPacked] 1160 iterations done. 10923487096 characters processed.\n", "[bwt_gen] Finished constructing BWT in 1160 iterations.\n", - "[bwa_index] 4810.53 seconds elapse.\n", - "[bwa_index] Update BWT... 27.18 sec\n", - "[bwa_index] Pack forward-only FASTA... 48.19 sec\n", - "[bwa_index] Construct SA from BWT and Occ... 1602.31 sec\n", - "[main] Version: 0.7.17-r1188\n", + "[bwa_index] 4879.55 seconds elapse.\n", + "[bwa_index] Update BWT... 24.77 sec\n", + "[bwa_index] Pack forward-only FASTA... 41.98 sec\n", + "[bwa_index] Construct SA from BWT and Occ... 1642.45 sec\n", + "[main] Version: 0.7.19-r1273\n", "[main] CMD: bwa index GRCm38_EiJ_snpsonly.fa.gz\n", - "[main] Real time: 6563.846 sec; CPU: 6550.547 sec\n" + "[main] Real time: 6748.211 sec; CPU: 6643.429 sec\n" ] } ], @@ -513,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "id": "69489018-edde-4aa0-b7ac-7c7b4351764c", "metadata": {}, "outputs": [], @@ -531,22 +535,31 @@ "source": [ "## Download data\n", "\n", - "Uncomment the `--minSpotId` and `--maxSpotId` if you want to run the small test instead of full run." + "Comment out the `--minSpotId` and `--maxSpotId` if you want to run the whole dataet instead of a smaller test." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 64, "id": "f4e310c0-2d16-4e7d-87d7-44feec8e6256", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Read 1000000 spots for SRR8811373\n", + "Written 1000000 spots for SRR8811373\n" + ] + } + ], "source": [ - "! fastq-dump SRR8811373 --gzip --split-spot --split-3 # --minSpotId 0 --maxSpotId 1000000" + "! fastq-dump SRR8811373 --gzip --split-spot --split-3 --minSpotId 0 --maxSpotId 1000000" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 65, "id": "571e94fb-3dec-4042-9e21-6c39802ed8df", "metadata": {}, "outputs": [ @@ -600,7 +613,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 66, "id": "12f8a13d-fba6-45f7-8112-291fb883d7d0", "metadata": { "tags": [ @@ -614,7 +627,7 @@ "text": [ "[M::bwa_idx_load_from_disk] read 0 ALT contigs\n", "[M::process] read 331126 sequences (50000026 bp)...\n", - "[M::process] read 127590 sequences (19265939 bp)...\n", + "[M::process] read 331126 sequences (50000026 bp)...\n", "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (1, 238, 0, 0)\n", "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", @@ -624,19 +637,71 @@ "[M::mem_pestat] low and high boundaries for proper pairs: (1, 354)\n", "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", - "[M::mem_process_seqs] Processed 331126 reads in 276.310 CPU sec, 55.358 real sec\n", - "[W::bseq_read] the 1st file has fewer sequences.\n", - "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 98, 0, 0)\n", + "[M::mem_process_seqs] Processed 331126 reads in 292.355 CPU sec, 58.890 real sec\n", + "[M::process] read 331126 sequences (50000026 bp)...\n", + "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (1, 239, 1, 0)\n", "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", - "[M::mem_pestat] (25, 50, 75) percentile: (73, 107, 164)\n", - "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 346)\n", - "[M::mem_pestat] mean and std.dev: (118.69, 58.74)\n", - "[M::mem_pestat] low and high boundaries for proper pairs: (1, 437)\n", + "[M::mem_pestat] (25, 50, 75) percentile: (72, 98, 151)\n", + "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 309)\n", + "[M::mem_pestat] mean and std.dev: (112.62, 55.45)\n", + "[M::mem_pestat] low and high boundaries for proper pairs: (1, 388)\n", "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", - "[mem_sam_pe] paired reads have different names: \"SRR8811373.22935\", \"SRR8811373.229358\"\n", - "\n" + "[M::mem_process_seqs] Processed 331126 reads in 297.579 CPU sec, 59.685 real sec\n", + "[M::process] read 331126 sequences (50000026 bp)...\n", + "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 251, 0, 2)\n", + "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", + "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", + "[M::mem_pestat] (25, 50, 75) percentile: (67, 98, 150)\n", + "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 316)\n", + "[M::mem_pestat] mean and std.dev: (115.38, 63.80)\n", + "[M::mem_pestat] low and high boundaries for proper pairs: (1, 399)\n", + "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", + "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", + "[M::mem_process_seqs] Processed 331126 reads in 302.460 CPU sec, 60.685 real sec\n", + "[M::process] read 331126 sequences (50000026 bp)...\n", + "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 246, 0, 0)\n", + "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", + "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", + "[M::mem_pestat] (25, 50, 75) percentile: (74, 111, 150)\n", + "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 302)\n", + "[M::mem_pestat] mean and std.dev: (118.62, 58.53)\n", + "[M::mem_pestat] low and high boundaries for proper pairs: (1, 378)\n", + "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", + "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", + "[M::mem_process_seqs] Processed 331126 reads in 305.608 CPU sec, 61.301 real sec\n", + "[M::process] read 331126 sequences (50000026 bp)...\n", + "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (1, 236, 0, 0)\n", + "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", + "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", + "[M::mem_pestat] (25, 50, 75) percentile: (77, 105, 156)\n", + "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 314)\n", + "[M::mem_pestat] mean and std.dev: (121.80, 64.59)\n", + "[M::mem_pestat] low and high boundaries for proper pairs: (1, 393)\n", + "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", + "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", + "[M::mem_process_seqs] Processed 331126 reads in 300.629 CPU sec, 60.285 real sec\n", + "[M::process] read 13244 sequences (1999844 bp)...\n", + "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 210, 0, 0)\n", + "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", + "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", + "[M::mem_pestat] (25, 50, 75) percentile: (65, 101, 146)\n", + "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (1, 308)\n", + "[M::mem_pestat] mean and std.dev: (112.62, 59.51)\n", + "[M::mem_pestat] low and high boundaries for proper pairs: (1, 389)\n", + "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", + "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", + "[M::mem_process_seqs] Processed 331126 reads in 303.963 CPU sec, 61.089 real sec\n", + "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 7, 0, 0)\n", + "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", + "[M::mem_pestat] skip orientation FR as there are not enough pairs\n", + "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", + "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", + "[M::mem_process_seqs] Processed 13244 reads in 12.949 CPU sec, 2.548 real sec\n", + "[main] Version: 0.7.19-r1273\n", + "[main] CMD: bwa mem -SP -t 5 GRCm38_EiJ_snpsonly.fa.gz SRR8811373_1.fastq.gz SRR8811373_2.fastq.gz\n", + "[main] Real time: 370.109 sec; CPU: 1821.144 sec\n" ] } ], @@ -659,18 +724,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 67, "id": "efc63459-aa2f-44f5-804e-a2346d2b7820", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[E::idx_find_and_load] Could not retrieve index file for 'mapped.XA.bam'\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "pairtools parse --min-mapq 0 --add-columns XA,NM,AS,XS --drop-sam --walks-policy all \\\n", @@ -687,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 68, "id": "e1788b32", "metadata": {}, "outputs": [ @@ -695,7 +752,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "253813\n" + "1554819\n" ] } ], @@ -715,7 +772,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 69, "id": "6c8deaee-cb68-4b53-b306-bf223523ab45", "metadata": {}, "outputs": [], @@ -736,7 +793,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 70, "id": "6aabbc13-a8d4-43f2-b388-62e7b3b576ab", "metadata": {}, "outputs": [], @@ -755,20 +812,30 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 71, "id": "9fd3b266-4faa-4fc0-974d-b0ca9bbeb961", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:py.warnings:/tungstenfs/scratch/ggiorget/ilya/develop/pairtools/pairtools/lib/stats.py:410: RuntimeWarning: invalid value encountered in divide\n", + " np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands)\n", + "\n" + ] + } + ], "source": [ "%%bash\n", - "pairtools dedup --mark-dups --extra-col-pair phase1 phase2 \\\n", + "pairtools dedup --mark-dups --extra-col-pair phase1 phase1 --extra-col-pair phase2 phase2 \\\n", " --output-dups - --output-unmapped - --output-stats phased.XA.dedup.stats \\\n", " -o phased.sorted.XA.nodup.pairs.gz phased.sorted.XA.pairs.gz" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 72, "id": "0e485084", "metadata": {}, "outputs": [ @@ -776,41 +843,41 @@ "name": "stdout", "output_type": "stream", "text": [ - "total\t232761\n", - "total_unmapped\t29132\n", - "total_single_sided_mapped\t44444\n", - "total_mapped\t159185\n", - "total_dups\t5219\n", - "total_nodups\t153966\n", - "cis\t17712\n", - "trans\t136254\n", - "pair_types/MM\t21281\n", - "pair_types/NM\t7241\n", - "pair_types/NN\t610\n", - "pair_types/NU\t19507\n", - "pair_types/MU\t22736\n", - "pair_types/UU\t153966\n", - "pair_types/UM\t2201\n", - "pair_types/DD\t5219\n", - "cis_1kb+\t9470\n", - "cis_2kb+\t9225\n", - "cis_4kb+\t8839\n", - "cis_10kb+\t8161\n", - "cis_20kb+\t7590\n", - "cis_40kb+\t7162\n", - "summary/frac_cis\t0.11503838509800865\n", - "summary/frac_cis_1kb+\t0.06150708598002156\n", - "summary/frac_cis_2kb+\t0.05991582557187951\n", - "summary/frac_cis_4kb+\t0.05740877856150059\n", - "summary/frac_cis_10kb+\t0.053005208942234\n", - "summary/frac_cis_20kb+\t0.049296597950196794\n", - "summary/frac_cis_40kb+\t0.046516763441279245\n", - "summary/frac_dups\t0.032785752426422086\n", - "summary/complexity_naive\t2374298.3333298806\n", - "chrom_freq/1/1\t4333\n", - "chrom_freq/10/10\t7537\n", - "chrom_freq/1/6\t173\n", - "chrom_freq/1/7\t121\n" + "total\t1554819\n", + "total_unmapped\t203628\n", + "total_single_sided_mapped\t403762\n", + "total_mapped\t947429\n", + "total_dups\t64192\n", + "total_nodups\t883237\n", + "cis\t820226\n", + "trans\t63011\n", + "pair_types/MM\t138812\n", + "pair_types/NM\t60607\n", + "pair_types/NN\t4209\n", + "pair_types/NU\t130260\n", + "pair_types/MU\t134901\n", + "pair_types/UM\t138601\n", + "pair_types/UU\t883237\n", + "pair_types/DD\t64192\n", + "cis_1kb+\t461344\n", + "cis_2kb+\t452308\n", + "cis_4kb+\t438309\n", + "cis_10kb+\t412148\n", + "cis_20kb+\t389158\n", + "cis_40kb+\t364240\n", + "summary/frac_cis\t0.928659012246996\n", + "summary/frac_cis_1kb+\t0.5223331902988666\n", + "summary/frac_cis_2kb+\t0.5121026406276005\n", + "summary/frac_cis_4kb+\t0.49625298758996733\n", + "summary/frac_cis_10kb+\t0.46663353097752924\n", + "summary/frac_cis_20kb+\t0.4406042772211762\n", + "summary/frac_cis_40kb+\t0.4123921438979572\n", + "summary/frac_dups\t0.06775388973738401\n", + "summary/complexity_naive\t6672183.642130476\n", + "summary/dist_freq_convergence/convergence_dist\t237137371\n", + "summary/dist_freq_convergence/strands_w_max_convergence_dist\t++\n", + "summary/dist_freq_convergence/convergence_rel_diff_threshold\t0.05\n", + "summary/dist_freq_convergence/n_cis_pairs_below_convergence_dist/++\t117254\n" ] } ], @@ -838,7 +905,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 73, "id": "727a9d2b-5977-4763-81e5-64589c067688", "metadata": {}, "outputs": [], @@ -861,7 +928,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 74, "id": "1172f899-41d6-4ca2-ab21-a283340011f8", "metadata": {}, "outputs": [ @@ -869,14 +936,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "WARNING:py.warnings:/users/anton.goloborodko/src/pairtools/pairtools/cli/stats.py:192: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "WARNING:py.warnings:/tungstenfs/scratch/ggiorget/ilya/develop/pairtools/pairtools/lib/stats.py:410: RuntimeWarning: invalid value encountered in divide\n", + " np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands)\n", + "\n", + "WARNING:py.warnings:/tungstenfs/scratch/ggiorget/ilya/develop/pairtools/pairtools/lib/stats.py:410: RuntimeWarning: invalid value encountered in divide\n", + " np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands)\n", + "\n", + "WARNING:py.warnings:/tungstenfs/scratch/ggiorget/ilya/develop/pairtools/pairtools/cli/stats.py:202: DtypeWarning: Columns (1,3) have mixed types. Specify dtype option on import or set low_memory=False.\n", " for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000):\n", "\n", - "WARNING:py.warnings:/users/anton.goloborodko/src/pairtools/pairtools/cli/stats.py:192: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "WARNING:py.warnings:/tungstenfs/scratch/ggiorget/ilya/develop/pairtools/pairtools/cli/stats.py:202: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n", " for chunk in pd.read_table(body_stream, names=cols, chunksize=100_000):\n", "\n", - "WARNING:py.warnings:/users/anton.goloborodko/src/pairtools/pairtools/lib/stats.py:880: RuntimeWarning: divide by zero encountered in double_scalars\n", - " complexity = float(nseq / seq_to_complexity) # clean np.int64 data type\n", + "WARNING:py.warnings:/tungstenfs/scratch/ggiorget/ilya/develop/pairtools/pairtools/lib/stats.py:410: RuntimeWarning: invalid value encountered in divide\n", + " np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands)\n", + "\n", + "WARNING:py.warnings:/tungstenfs/scratch/ggiorget/ilya/develop/pairtools/pairtools/lib/stats.py:410: RuntimeWarning: invalid value encountered in divide\n", + " np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands)\n", "\n" ] } @@ -901,7 +977,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 75, "id": "2567ccea", "metadata": {}, "outputs": [ @@ -909,26 +985,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "total\t574\n", + "total\t3704\n", "total_unmapped\t0\n", "total_single_sided_mapped\t0\n", - "total_mapped\t574\n", - "total_dups\t13\n", - "total_nodups\t561\n", - "cis\t535\n", - "trans\t26\n", - "pair_types/UU\t561\n", - "pair_types/DD\t13\n", - "cis_1kb+\t68\n", - "cis_2kb+\t67\n", - "cis_4kb+\t60\n", - "cis_10kb+\t56\n", - "cis_20kb+\t48\n", - "cis_40kb+\t42\n", - "summary/frac_cis\t0.9536541889483066\n", - "summary/frac_cis_1kb+\t0.12121212121212122\n", - "summary/frac_cis_2kb+\t0.11942959001782531\n", - "summary/frac_cis_4kb+\t0.10695187165775401\n" + "total_mapped\t3704\n", + "total_dups\t257\n", + "total_nodups\t3447\n", + "cis\t3305\n", + "trans\t142\n", + "pair_types/UU\t3447\n", + "pair_types/DD\t257\n", + "cis_1kb+\t342\n", + "cis_2kb+\t318\n", + "cis_4kb+\t295\n", + "cis_10kb+\t260\n", + "cis_20kb+\t227\n", + "cis_40kb+\t199\n", + "summary/frac_cis\t0.9588047577603713\n", + "summary/frac_cis_1kb+\t0.09921671018276762\n", + "summary/frac_cis_2kb+\t0.09225413402959094\n", + "summary/frac_cis_4kb+\t0.08558166521612996\n" ] } ], @@ -938,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 76, "id": "8f521224", "metadata": {}, "outputs": [ @@ -946,26 +1022,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "total\t485\n", + "total\t3102\n", "total_unmapped\t0\n", "total_single_sided_mapped\t0\n", - "total_mapped\t485\n", - "total_dups\t9\n", - "total_nodups\t476\n", - "cis\t394\n", - "trans\t82\n", - "pair_types/UU\t476\n", - "pair_types/DD\t9\n", - "cis_1kb+\t53\n", - "cis_2kb+\t51\n", - "cis_4kb+\t50\n", - "cis_10kb+\t44\n", - "cis_20kb+\t41\n", - "cis_40kb+\t38\n", - "summary/frac_cis\t0.8277310924369747\n", - "summary/frac_cis_1kb+\t0.11134453781512606\n", - "summary/frac_cis_2kb+\t0.10714285714285714\n", - "summary/frac_cis_4kb+\t0.10504201680672269\n" + "total_mapped\t3102\n", + "total_dups\t278\n", + "total_nodups\t2824\n", + "cis\t2389\n", + "trans\t435\n", + "pair_types/UU\t2824\n", + "pair_types/DD\t278\n", + "cis_1kb+\t328\n", + "cis_2kb+\t313\n", + "cis_4kb+\t285\n", + "cis_10kb+\t255\n", + "cis_20kb+\t244\n", + "cis_40kb+\t220\n", + "summary/frac_cis\t0.8459631728045326\n", + "summary/frac_cis_1kb+\t0.11614730878186968\n", + "summary/frac_cis_2kb+\t0.1108356940509915\n", + "summary/frac_cis_4kb+\t0.10092067988668556\n" ] } ], @@ -975,7 +1051,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 77, "id": "3bb8d589", "metadata": {}, "outputs": [ @@ -983,26 +1059,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "total\t241\n", + "total\t1384\n", "total_unmapped\t0\n", "total_single_sided_mapped\t0\n", - "total_mapped\t241\n", - "total_dups\t0\n", - "total_nodups\t241\n", - "cis\t177\n", - "trans\t64\n", - "pair_types/UU\t241\n", - "cis_1kb+\t85\n", - "cis_2kb+\t83\n", - "cis_4kb+\t81\n", - "cis_10kb+\t75\n", - "cis_20kb+\t70\n", - "cis_40kb+\t64\n", - "summary/frac_cis\t0.7344398340248963\n", - "summary/frac_cis_1kb+\t0.35269709543568467\n", - "summary/frac_cis_2kb+\t0.34439834024896265\n", - "summary/frac_cis_4kb+\t0.3360995850622407\n", - "summary/frac_cis_10kb+\t0.3112033195020747\n" + "total_mapped\t1384\n", + "total_dups\t137\n", + "total_nodups\t1247\n", + "cis\t888\n", + "trans\t359\n", + "pair_types/UU\t1247\n", + "pair_types/DD\t137\n", + "cis_1kb+\t386\n", + "cis_2kb+\t375\n", + "cis_4kb+\t355\n", + "cis_10kb+\t322\n", + "cis_20kb+\t294\n", + "cis_40kb+\t260\n", + "summary/frac_cis\t0.7121090617481957\n", + "summary/frac_cis_1kb+\t0.3095429029671211\n", + "summary/frac_cis_2kb+\t0.30072173215717724\n", + "summary/frac_cis_4kb+\t0.2846832397754611\n" ] } ], @@ -1012,7 +1088,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 78, "id": "29e85a23", "metadata": {}, "outputs": [ @@ -1020,26 +1096,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "total\t200077\n", + "total\t1320334\n", "total_unmapped\t0\n", - "total_single_sided_mapped\t42192\n", - "total_mapped\t157885\n", - "total_dups\t5197\n", - "total_nodups\t152688\n", - "cis\t52294\n", - "trans\t100394\n", - "pair_types/UU\t152688\n", - "pair_types/MU\t21277\n", - "pair_types/NU\t18827\n", - "pair_types/UM\t2088\n", - "pair_types/DD\t5197\n", - "cis_1kb+\t29483\n", - "cis_2kb+\t28885\n", - "cis_4kb+\t27905\n", - "cis_10kb+\t26158\n", - "cis_20kb+\t24601\n", - "cis_40kb+\t23101\n", - "summary/frac_cis\t0.3424892591428272\n" + "total_single_sided_mapped\t381095\n", + "total_mapped\t939239\n", + "total_dups\t63520\n", + "total_nodups\t875719\n", + "cis\t159893\n", + "trans\t715826\n", + "pair_types/NU\t125754\n", + "pair_types/MU\t126051\n", + "pair_types/UU\t875719\n", + "pair_types/UM\t129290\n", + "pair_types/DD\t63520\n", + "cis_1kb+\t87087\n", + "cis_2kb+\t85108\n", + "cis_4kb+\t81984\n", + "cis_10kb+\t76226\n", + "cis_20kb+\t71287\n", + "cis_40kb+\t66858\n", + "summary/frac_cis\t0.18258482458414171\n" ] } ], @@ -1120,21 +1196,9 @@ ], "metadata": { "kernelspec": { - "display_name": "main", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "main" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" + "name": "python3" } }, "nbformat": 4, diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py index 5dbe6fd4..fe6f197c 100644 --- a/pairtools/cli/dedup.py +++ b/pairtools/cli/dedup.py @@ -160,7 +160,10 @@ "duplicates. Can be either provided as 0-based column indices or as column " 'names (requires the "#columns" header field). The option can be provided ' "multiple times if multiple column pairs must match. " - 'Example: --extra-col-pair "phase1" "phase2". [output format option]', + 'Example for dedup of phased pairs, remove duplicates matching in columns ' + 'phase1 and (independently) in column phase2. ' + 'Note how each --extra-col-pair expects a pair of columns as input:' + '--extra-col-pair "phase1" "phase1" --extra-col-pair "phase2" "phase2". [output format option]', ) ### Input options: