Skip to content

Commit 8254fc4

Browse files
author
rhliang
committed
WIP: examining the conversion from old frequency file format to new.
Also some untested work on converting EasyHLA's read method for this data.
1 parent a029519 commit 8254fc4

File tree

3 files changed

+98
-18
lines changed

3 files changed

+98
-18
lines changed

src/easyhla/easyhla.py

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import csv
12
import os
23
from collections.abc import Iterable, Sequence
34
from datetime import datetime
@@ -161,20 +162,45 @@ def read_hla_frequencies(
161162
"B": {},
162163
"C": {},
163164
}
164-
for line in frequencies_io.readlines():
165-
for locus in ("A", "B", "C"):
166-
column_id = EasyHLA.COLUMN_IDS[locus]
167-
line_array = line.strip().split(",")[column_id : column_id + 2]
168-
169-
protein_pair: HLAProteinPair = HLAProteinPair(
170-
first_field_1=line_array[0][0:2],
171-
first_field_2=line_array[0][2:4],
172-
second_field_1=line_array[1][0:2],
173-
second_field_2=line_array[1][2:4],
174-
)
175-
if hla_freqs[locus].get(protein_pair, None) is None:
176-
hla_freqs[locus][protein_pair] = 0
177-
hla_freqs[locus][protein_pair] += 1
165+
166+
locus_columns: dict[HLA_LOCUS, tuple[str, str]] = {
167+
"A": ("a_first", "a_second"),
168+
"B": ("b_first", "b_second"),
169+
"C": ("c_first", "c_second"),
170+
}
171+
with frequencies_io:
172+
frequencies_csv: csv.DictReader = csv.DictReader(frequencies_io)
173+
174+
for row in frequencies_csv:
175+
for locus in ("A", "B", "C"):
176+
curr_col1: str
177+
curr_col2: str
178+
curr_col1, curr_col2 = locus_columns[locus]
179+
raw_allele_1: str = row[curr_col1]
180+
raw_allele_2: str = row[curr_col2]
181+
182+
if raw_allele_1 in ("unknown", "deprecated"):
183+
continue
184+
if raw_allele_2 in ("unknown", "deprecated"):
185+
continue
186+
187+
first_field_1: str
188+
first_field_2: str
189+
second_field_1: str
190+
second_field_2: str
191+
192+
first_field_1, first_field_2 = raw_allele_1.split(":")
193+
second_field_1, second_field_2 = raw_allele_2.split(":")
194+
195+
protein_pair: HLAProteinPair = HLAProteinPair(
196+
first_field_1=first_field_1,
197+
first_field_2=first_field_2,
198+
second_field_1=second_field_1,
199+
second_field_2=second_field_2,
200+
)
201+
if hla_freqs[locus].get(protein_pair, None) is None:
202+
hla_freqs[locus][protein_pair] = 0
203+
hla_freqs[locus][protein_pair] += 1
178204
return hla_freqs
179205

180206
@staticmethod
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Exploratory look at the old-format HLA frequency file: find the rows whose
# A alleles fall in the "74" group and whose C alleles fall in the "17" / "18"
# groups, so they can be compared against the new nomenclature by eye.
#
# NOTE(review): the file is assumed to have six header-less columns (two
# alleles each for loci A, B, C), matching the col.names given below -- confirm
# against the data file.
old.frequencies <- read.csv(
  "src/easyhla/default_data/hla_frequencies.csv",
  header=FALSE,
  col.names=c("a1", "a2", "b1", "b2", "c1", "c2"),
  # Read everything as text so values are compared as strings, not numbers.
  colClasses="character"
)

# Indices of the entries of `alleles` that start with `group` but are not the
# bare "<group>00" entry.
rows.in.group <- function(alleles, group) {
  which(startsWith(alleles, group) & alleles != paste0(group, "00"))
}

# Exact matches for two specific A alleles; kept for interactive inspection.
a1.7401 <- which(old.frequencies$a1 == "7401")
a2.7401 <- which(old.frequencies$a2 == "7401")
a1.7403 <- which(old.frequencies$a1 == "7403")
a2.7403 <- which(old.frequencies$a2 == "7403")

a1.74xx <- rows.in.group(old.frequencies$a1, "74")
a2.74xx <- rows.in.group(old.frequencies$a2, "74")

c1.17xx <- rows.in.group(old.frequencies$c1, "17")
c2.17xx <- rows.in.group(old.frequencies$c2, "17")
c1.18xx <- rows.in.group(old.frequencies$c1, "18")
c2.18xx <- rows.in.group(old.frequencies$c2, "18")

# Show each set of matching rows once, in file order.
old.frequencies[sort(union(a1.74xx, a2.74xx)),]
old.frequencies[sort(union(c1.17xx, c2.17xx)),]
old.frequencies[sort(union(c1.18xx, c2.18xx)),]

src/scripts/update_frequency_file.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import csv
55
import logging
66
import re
7+
from collections import Counter
78
from dataclasses import dataclass, fields
89
from io import TextIOBase
910
from typing import ClassVar, Final, Optional, Self, TypedDict
@@ -98,6 +99,8 @@ def parse_nomenclature(remapping_str: str) -> dict[OldName, NewName]:
9899
except OtherLocusException:
99100
continue
100101
new_name: NewName = NewName.from_string(new_name_str)
102+
if new_name.locus is None:
103+
logger.info(f"Allele {old_name_str} is deprecated.")
101104
if old_name in remapping:
102105
if new_name.locus is None:
103106
logger.info(
@@ -168,7 +171,10 @@ def update_old_frequencies(
168171
) -> list[FrequencyRow]:
169172
old_frequencies_csv: csv.reader = csv.reader(old_frequencies_file)
170173

171-
unmapped_alleles: set[tuple[HLA_LOCUS, str]] = set()
174+
# Report to the user any frequencies that are either unmapped or
175+
# deprecated.
176+
unmapped_alleles: Counter[tuple[HLA_LOCUS, str]] = Counter()
177+
deprecated_alleles_seen: Counter[tuple[HLA_LOCUS, str]] = Counter()
172178

173179
updated_frequencies: list[FrequencyRow] = []
174180
for row in old_frequencies_csv:
@@ -180,13 +186,35 @@ def update_old_frequencies(
180186
old_name: OldName = OldName.from_old_frequency_format(locus, row[idx])
181187
new_name: Optional[NewName] = old_to_new.get(old_name)
182188

183-
if new_name is None and (locus, row[idx]) not in unmapped_alleles:
184-
logger.info(f'No mapping found for HLA-{locus} allele "{row[idx]}".')
185-
unmapped_alleles.add((locus, row[idx]))
189+
if new_name is None:
190+
unmapped_alleles[(locus, row[idx])] += 1
191+
elif new_name.locus is None:
192+
deprecated_alleles_seen[(locus, row[idx])] += 1
186193

187194
updated.append(new_name)
188195

189196
updated_frequencies.append(FrequencyRow(*updated))
197+
198+
# Summarise unmapped alleles once, after all rows have been processed.
# A truthiness check (rather than `len(...) > 1`) makes sure the summary is
# still emitted when exactly one distinct allele was unmapped.
if unmapped_alleles:
    logger.info(
        "Alleles present in the old frequencies that do not have a mapping "
        "in the new nomenclature, and their numbers of occurrences:"
    )
    # Counter iterates in first-seen order; .items() avoids a second lookup.
    for (locus, name), count in unmapped_alleles.items():
        logger.info(f'HLA-{locus} allele "{name}": {count}')

# Same summary for alleles whose new name has no locus (i.e. the ones counted
# as deprecated in the loop above); also reported when there is only one.
if deprecated_alleles_seen:
    logger.info(
        "Alleles present in the old frequencies that are deprecated "
        "in the new nomenclature, and their numbers of occurrences:"
    )
    for (locus, name), count in deprecated_alleles_seen.items():
        logger.info(f'HLA-{locus} allele "{name}": {count}')
217+
190218
return updated_frequencies
191219

192220

0 commit comments

Comments
 (0)