Skip to content

Commit 4ff9bd9

Browse files
authored
Merge pull request #487 from sunbeam-labs/485-fix-filter_reads-and-remove_low_complexity-memory-inefficiency
485 fix filter reads and remove low complexity memory inefficiency
2 parents a67f9be + 940d041 commit 4ff9bd9

File tree

6 files changed

+32
-41
lines changed

6 files changed

+32
-41
lines changed

.github/workflows/release.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ jobs:
9191
with:
9292
token: ${{ secrets.GITHUB_TOKEN }}
9393
id: ${{ github.event.release.id }}
94-
body: "### sunbeamlabs/sunbeam\n${{ needs.push-dockerhub.outputs.sunbeam_package_versions }}\n### sunbeamlabs/sunbeam:slim\n${{ needs.push-dockerhub.outputs.sunbeam_package_versions_slim }}\n### sunbeamlabs/cutadapt\n${{ needs.push-dockerhub.outputs.cutadapt_package_versions }}\n### sunbeamlabs/komplexity\n${{ needs.push-dockerhub.outputs.komplexity_package_versions }}\n### sunbeamlabs/qc\n${{ needs.push-dockerhub.outputs.qc_package_versions }}\n### sunbeamlabs/reports\n${{ needs.push-dockerhub.outputs.reports_package_versions }}"
94+
body: "**sunbeamlabs/sunbeam**: ${{ needs.push-dockerhub.outputs.sunbeam_package_versions }}\n**sunbeamlabs/sunbeam:slim**: ${{ needs.push-dockerhub.outputs.sunbeam_package_versions_slim }}\n**sunbeamlabs/cutadapt**: ${{ needs.push-dockerhub.outputs.cutadapt_package_versions }}\n**sunbeamlabs/komplexity**: ${{ needs.push-dockerhub.outputs.komplexity_package_versions }}\n**sunbeamlabs/qc**: ${{ needs.push-dockerhub.outputs.qc_package_versions }}\n**sunbeamlabs/reports**: ${{ needs.push-dockerhub.outputs.reports_package_versions }}"
9595
replacebody: false
9696

9797
run-integration-tests:

install.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env bash
22

33
__conda_url=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
4-
__version_tag=$(if git describe --tags >/dev/null 2>&1 ; then git describe --tags; else echo v4.5.2; fi) # <--- Update this on each version release
4+
__version_tag=$(if git describe --tags >/dev/null 2>&1 ; then git describe --tags; else echo v4.6.0; fi) # <--- Update this on each version release
55
__version_tag="${__version_tag:1}" # Remove the 'v' prefix
66

77
read -r -d '' __usage <<-'EOF'

src/sunbeamlib/parse.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def write_fasta(record: Tuple[str, str], f: TextIO) -> None:
4848

4949

5050
def parse_fastq(f: TextIO) -> Iterator[Tuple[str, str, str, str]]:
51-
for g in grouper(f.readlines(), 4):
51+
for g in grouper(f, 4):
5252
header_str = g[0][1:].strip()
5353
seq_str = g[1].strip()
5454
plus_str = g[2].strip()
@@ -58,11 +58,8 @@ def parse_fastq(f: TextIO) -> Iterator[Tuple[str, str, str, str]]:
5858

5959

6060
def write_fastq(record: Tuple[str, str, str, str], f: TextIO) -> None:
61-
for i, l in enumerate(record):
62-
if i == 0:
63-
f.write(f"@{l}\n")
64-
else:
65-
f.write(f"{l}\n")
61+
s = f"@{record[0]}\n{record[1]}\n{record[2]}\n{record[3]}\n"
62+
f.write(s)
6663

6764

6865
def write_many_fastq(record_list: List[Tuple[str, str, str, str]], f: TextIO) -> None:

src/sunbeamlib/qc.py

+13-18
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,20 @@
33
"""
44

55
import gzip
6+
import sys
67
from pathlib import Path
78
from sunbeamlib.parse import parse_fastq, write_fastq
8-
from typing import List, TextIO
9+
from typing import Set, TextIO
910

1011

11-
from typing import List, TextIO
12-
from pathlib import Path
13-
import gzip
14-
15-
16-
def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
12+
def filter_ids(fp_in: Path, fp_out: Path, ids: Set[str], log: TextIO) -> None:
1713
"""
18-
Filter FASTQ records based on a list of IDs.
14+
Filter FASTQ records based on a set of IDs to remove.
1915
2016
Args:
2117
fp_in (Path): Path to the input FASTQ file.
2218
fp_out (Path): Path to the output FASTQ file.
23-
ids (List[str]): List of IDs to filter.
19+
ids (Set[str]): Set of IDs to filter.
2420
log (TextIO): TextIO object to write log messages.
2521
2622
Returns:
@@ -31,15 +27,14 @@ def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
3127
3228
"""
3329
with gzip.open(fp_in, "rt") as f_in, gzip.open(fp_out, "wt") as f_out:
34-
ids_set = set(ids)
35-
num_ids = len(ids_set)
30+
num_ids = len(ids)
3631
counter = 0
3732
counter_kept = 0
3833
for record in parse_fastq(f_in):
3934
counter += 1
4035
record = (remove_pair_id(record[0], log), record[1], record[2], record[3])
41-
if record[0] in ids_set:
42-
ids_set.remove(record[0])
36+
if record[0] in ids:
37+
ids.remove(record[0])
4338
else:
4439
counter_kept += 1
4540
write_fastq(record, f_out)
@@ -48,19 +43,19 @@ def filter_ids(fp_in: Path, fp_out: Path, ids: List[str], log: TextIO) -> None:
4843
log.write(
4944
f"ERROR: Mismatch (Removed: {counter - counter_kept}, Supposed to remove: {num_ids})\n"
5045
)
51-
log.write(f"IDs not found: {ids_set}\n")
46+
log.write(f"IDs not found: {ids}\n")
5247
assert (
5348
False
5449
), f"ERROR: Mismatch (Removed: {counter - counter_kept}, Supposed to remove: {num_ids})"
5550

56-
if len(ids_set) > 0:
57-
log.write(f"WARNING: {len(ids_set)} ids not found in FASTQ\n")
58-
log.write(f"IDs not found: {ids_set}\n")
51+
if len(ids) > 0:
52+
log.write(f"WARNING: {len(ids)} ids not found in FASTQ\n")
53+
log.write(f"IDs not found: {ids}\n")
5954
else:
6055
log.write("IDs list empty, finished filtering\n")
6156

6257

63-
def remove_pair_id(id: str, log: TextIO) -> str:
58+
def remove_pair_id(id: str, log: TextIO = sys.stdout) -> str:
6459
"""
6560
Removes the pair identifier from the given ID.
6661

workflow/scripts/filter_reads.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,20 @@
99
from sunbeamlib.parse import parse_fastq, write_fastq
1010

1111

12-
def count_host_reads(fp: str, hostdict: dict, net_hostlist: set):
12+
def count_host_reads(fp: str, hostdict: dict) -> set:
1313
hostname = os.path.basename(os.path.dirname(fp))
1414
hostcts = int(sp.getoutput("cat {} | wc -l".format(fp)).strip())
1515
hostdict[hostname] = hostcts
1616

1717
with open(fp) as f:
18-
for l in f.readlines():
19-
net_hostlist.add(l) # Only adds unique ids
18+
return set(l.strip() for l in f.readlines())
2019

2120

22-
def calculate_counts(fp: str, net_hostlist: set) -> tuple:
21+
def calculate_counts(fp: str, len_hostlist: int) -> tuple:
2322
original = int(str(sp.getoutput("zcat {} | wc -l".format(fp))).strip()) // 4
24-
host = len(net_hostlist)
25-
nonhost = int(original - host)
23+
nonhost = int(original - len_hostlist)
2624

27-
return host, nonhost
25+
return len_hostlist, nonhost
2826

2927

3028
def write_log(f: TextIOWrapper, hostdict: OrderedDict, host: int, nonhost: int):
@@ -39,28 +37,30 @@ def write_log(f: TextIOWrapper, hostdict: OrderedDict, host: int, nonhost: int):
3937
done = False
4038
net_hostlist = set()
4139
for hostid in sorted(snakemake.input.hostids):
42-
count_host_reads(hostid, hostdict, net_hostlist)
40+
net_hostlist.update(count_host_reads(hostid, hostdict))
4341

44-
host, nonhost = calculate_counts(snakemake.input.reads, net_hostlist)
42+
host, nonhost = calculate_counts(snakemake.input.reads, len(net_hostlist))
4543

44+
# Check for empty host reads file
4645
with open(snakemake.input.hostreads) as f:
47-
if not f.readlines():
46+
# TODO: Remove aggregate_reads rule and just handle the host ids files here
47+
if not f.readline():
4848
s = f"WARNING: {snakemake.input.hostreads} is empty, skipping...\n"
4949
l.write(s)
5050
sys.stderr.write(s)
5151
shutil.copyfile(snakemake.input.reads, snakemake.output.reads)
5252
done = True
5353

54+
# Perform filtering if host reads file is not empty
5455
if not done:
5556
with gzip.open(snakemake.input.reads, "rt") as f_in, gzip.open(
5657
snakemake.output.reads, "wt"
57-
) as f_out, open(snakemake.input.hostreads) as f_ids:
58-
ids = {k.strip(): 1 for k in f_ids.readlines()}
58+
) as f_out:
5959
for header_str, seq_str, plus_str, quality_str in parse_fastq(f_in):
6060
parsed_header = (
6161
header_str.split(" ")[0].replace("/1", "").replace("/2", "")
6262
)
63-
if not parsed_header in ids:
63+
if not parsed_header in net_hostlist:
6464
write_fastq([header_str, seq_str, plus_str, quality_str], f_out)
6565

6666
# Check that the output file is about the right size given the number of ids removed
+1-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
from sunbeamlib.qc import filter_ids, remove_pair_id
22

33
with open(snakemake.log[0], "w") as log:
4-
ids = []
54
with open(snakemake.input.ids) as f:
6-
ids = [remove_pair_id(id, log) for id in f.readlines()]
5+
ids = set(remove_pair_id(id, log) for id in f.readlines())
76
log.write(f"Num Komplexity IDs to be filtered: {len(ids)}\n")
87

98
filter_ids(snakemake.input.reads, snakemake.output[0], ids, log)

0 commit comments

Comments
 (0)