Merge pull request #487 from sunbeam-labs/485-fix-filter_reads-and-remove_low_complexity-memory-inefficiency

Ulthran · web-flow · commit 4ff9bd9a24e6 · 2024-04-23T11:53:28.000-04:00
Fix memory inefficiency in filter_reads and remove_low_complexity (#485)
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -91,7 +91,7 @@ jobs:
             with:
               token: ${{ secrets.GITHUB_TOKEN }}
               id: ${{ github.event.release.id }}
-              body: "### sunbeamlabs/sunbeam\n${{ needs.push-dockerhub.outputs.sunbeam_package_versions }}\n### sunbeamlabs/sunbeam:slim\n${{ needs.push-dockerhub.outputs.sunbeam_package_versions_slim }}\n### sunbeamlabs/cutadapt\n${{ needs.push-dockerhub.outputs.cutadapt_package_versions }}\n### sunbeamlabs/komplexity\n${{ needs.push-dockerhub.outputs.komplexity_package_versions }}\n### sunbeamlabs/qc\n${{ needs.push-dockerhub.outputs.qc_package_versions }}\n### sunbeamlabs/reports\n${{ needs.push-dockerhub.outputs.reports_package_versions }}"
+              body: "**sunbeamlabs/sunbeam**: ${{ needs.push-dockerhub.outputs.sunbeam_package_versions }}\n**sunbeamlabs/sunbeam:slim**: ${{ needs.push-dockerhub.outputs.sunbeam_package_versions_slim }}\n**sunbeamlabs/cutadapt**: ${{ needs.push-dockerhub.outputs.cutadapt_package_versions }}\n**sunbeamlabs/komplexity**: ${{ needs.push-dockerhub.outputs.komplexity_package_versions }}\n**sunbeamlabs/qc**: ${{ needs.push-dockerhub.outputs.qc_package_versions }}\n**sunbeamlabs/reports**: ${{ needs.push-dockerhub.outputs.reports_package_versions }}"
               replacebody: false
         
     run-integration-tests:
diff --git a/install.sh b/install.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 __conda_url=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
-__version_tag=$(if git describe --tags >/dev/null 2>&1 ; then git describe --tags; else echo v4.5.2; fi) # <--- Update this on each version release
+__version_tag=$(if git describe --tags >/dev/null 2>&1 ; then git describe --tags; else echo v4.6.0; fi) # <--- Update this on each version release
 __version_tag="${__version_tag:1}" # Remove the 'v' prefix
 
 read -r -d '' __usage <<-'EOF'
diff --git a/src/sunbeamlib/parse.py b/src/sunbeamlib/parse.py
@@ -48,7 +48,7 @@ def write_fasta(record: Tuple[str, str], f: TextIO) -> None:
 
 
 def parse_fastq(f: TextIO) -> Iterator[Tuple[str, str, str, str]]:
-    for g in grouper(f.readlines(), 4):
+    for g in grouper(f, 4):
         header_str = g[0][1:].strip()
         seq_str = g[1].strip()
         plus_str = g[2].strip()
@@ -58,11 +58,8 @@ def parse_fastq(f: TextIO) -> Iterator[Tuple[str, str, str, str]]:
 
 
 def write_fastq(record: Tuple[str, str, str, str], f: TextIO) -> None:
-    for i, l in enumerate(record):
-        if i == 0:
-            f.write(f"@{l}\n")
-        else:
-            f.write(f"{l}\n")
+    s = f"@{record[0]}\n{record[1]}\n{record[2]}\n{record[3]}\n"
+    f.write(s)
 
 
 def write_many_fastq(record_list: List[Tuple[str, str, str, str]], f: TextIO) -> None:
diff --git a/src/sunbeamlib/qc.py b/src/sunbeamlib/qc.py
@@ -3,24 +3,20 @@
 """
 
 import gzip
+import sys
 from pathlib import Path
 from sunbeamlib.parse import parse_fastq, write_fastq
-from typing import List, TextIO
+from typing import Set, TextIO
 
 
-from typing import List, TextIO
-from pathlib import Path
-import gzip
-
-
-def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
+def filter_ids(fp_in: Path, fp_out: Path, ids: Set[str], log: TextIO) -> None:
     """
-    Filter FASTQ records based on a list of IDs.
+    Filter FASTQ records based on a set of IDs to remove.
 
     Args:
         fp_in (Path): Path to the input FASTQ file.
         fp_out (Path): Path to the output FASTQ file.
-        ids (List[str]): List of IDs to filter.
+        ids (Set[str]): Set of IDs to remove; each matched ID is removed from this set in place.
         log (TextIO): TextIO object to write log messages.
 
     Returns:
@@ -31,15 +27,14 @@ def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
 
     """
     with gzip.open(fp_in, "rt") as f_in, gzip.open(fp_out, "wt") as f_out:
-        ids_set = set(ids)
-        num_ids = len(ids_set)
+        num_ids = len(ids)
         counter = 0
         counter_kept = 0
         for record in parse_fastq(f_in):
             counter += 1
             record = (remove_pair_id(record[0], log), record[1], record[2], record[3])
-            if record[0] in ids_set:
-                ids_set.remove(record[0])
+            if record[0] in ids:
+                ids.remove(record[0])
             else:
                 counter_kept += 1
                 write_fastq(record, f_out)
@@ -48,19 +43,19 @@ def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
             log.write(
                 f"ERROR: Mismatch (Removed: {counter - counter_kept}, Supposed to remove: {num_ids})\n"
             )
-            log.write(f"IDs not found: {ids_set}\n")
+            log.write(f"IDs not found: {ids}\n")
             assert (
                 False
             ), f"ERROR: Mismatch (Removed: {counter - counter_kept}, Supposed to remove: {num_ids})"
 
-        if len(ids_set) > 0:
-            log.write(f"WARNING: {len(ids_set)} ids not found in FASTQ\n")
-            log.write(f"IDs not found: {ids_set}\n")
+        if len(ids) > 0:
+            log.write(f"WARNING: {len(ids)} ids not found in FASTQ\n")
+            log.write(f"IDs not found: {ids}\n")
         else:
             log.write("IDs list empty, finished filtering\n")
 
 
-def remove_pair_id(id: str, log: TextIO) -> str:
+def remove_pair_id(id: str, log: TextIO = sys.stdout) -> str:
     """
     Removes the pair identifier from the given ID.
 
diff --git a/workflow/scripts/filter_reads.py b/workflow/scripts/filter_reads.py
@@ -9,22 +9,20 @@
 from sunbeamlib.parse import parse_fastq, write_fastq
 
 
-def count_host_reads(fp: str, hostdict: dict, net_hostlist: set):
+def count_host_reads(fp: str, hostdict: dict) -> set:
     hostname = os.path.basename(os.path.dirname(fp))
     hostcts = int(sp.getoutput("cat {} | wc -l".format(fp)).strip())
     hostdict[hostname] = hostcts
 
     with open(fp) as f:
-        for l in f.readlines():
-            net_hostlist.add(l)  # Only adds unique ids
+        return set(l.strip() for l in f.readlines())
 
 
-def calculate_counts(fp: str, net_hostlist: set) -> tuple:
+def calculate_counts(fp: str, len_hostlist: int) -> tuple:
     original = int(str(sp.getoutput("zcat {} | wc -l".format(fp))).strip()) // 4
-    host = len(net_hostlist)
-    nonhost = int(original - host)
+    nonhost = int(original - len_hostlist)
 
-    return host, nonhost
+    return len_hostlist, nonhost
 
 
 def write_log(f: TextIOWrapper, hostdict: OrderedDict, host: int, nonhost: int):
@@ -39,28 +37,30 @@ def write_log(f: TextIOWrapper, hostdict: OrderedDict, host: int, nonhost: int):
     done = False
     net_hostlist = set()
     for hostid in sorted(snakemake.input.hostids):
-        count_host_reads(hostid, hostdict, net_hostlist)
+        net_hostlist.update(count_host_reads(hostid, hostdict))
 
-    host, nonhost = calculate_counts(snakemake.input.reads, net_hostlist)
+    host, nonhost = calculate_counts(snakemake.input.reads, len(net_hostlist))
 
+    # Check for empty host reads file
     with open(snakemake.input.hostreads) as f:
-        if not f.readlines():
+        # TODO: Remove aggregate_reads rule and just handle the host ids files here
+        if not f.readline():
             s = f"WARNING: {snakemake.input.hostreads} is empty, skipping...\n"
             l.write(s)
             sys.stderr.write(s)
             shutil.copyfile(snakemake.input.reads, snakemake.output.reads)
             done = True
 
+    # Perform filtering if host reads file is not empty
     if not done:
         with gzip.open(snakemake.input.reads, "rt") as f_in, gzip.open(
             snakemake.output.reads, "wt"
-        ) as f_out, open(snakemake.input.hostreads) as f_ids:
-            ids = {k.strip(): 1 for k in f_ids.readlines()}
+        ) as f_out:
             for header_str, seq_str, plus_str, quality_str in parse_fastq(f_in):
                 parsed_header = (
                     header_str.split(" ")[0].replace("/1", "").replace("/2", "")
                 )
-                if not parsed_header in ids:
+                if not parsed_header in net_hostlist:
                     write_fastq([header_str, seq_str, plus_str, quality_str], f_out)
 
         # Check that the output file is about the right size given the number of ids removed
diff --git a/workflow/scripts/remove_low_complexity.py b/workflow/scripts/remove_low_complexity.py
@@ -1,9 +1,8 @@
 from sunbeamlib.qc import filter_ids, remove_pair_id
 
 with open(snakemake.log[0], "w") as log:
-    ids = []
     with open(snakemake.input.ids) as f:
-        ids = [remove_pair_id(id, log) for id in f.readlines()]
+        ids = set(remove_pair_id(id, log) for id in f.readlines())
     log.write(f"Num Komplexity IDs to be filtered: {len(ids)}\n")
 
     filter_ids(snakemake.input.reads, snakemake.output[0], ids, log)