Skip to content

Commit 9bb6ee2

Browse files
author
rhliang
committed
WIP: cleaned up the update_frequency_file script and started adding tests.
Tests seem to be choking when I run the entire test suite; to get them to run I had to move some test files out of place!
1 parent b9a30ff commit 9bb6ee2

File tree

7 files changed

+663
-310
lines changed

7 files changed

+663
-310
lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ clinical_hla = "easyhla.clinical_hla:main"
6363
interpret_from_json = "easyhla.interpret_from_json:main"
6464
bblab = "easyhla.bblab:main"
6565
update_alleles = "easyhla.update_alleles:main"
66+
update_frequency_file = "easyhla.update_frequency_file:main"
6667

6768
[project.optional-dependencies]
6869
database = [
@@ -116,6 +117,7 @@ omit = [
116117
"src/easyhla/clinical_hla.py",
117118
"src/easyhla/interpret_from_json.py",
118119
"src/easyhla/update_alleles.py",
120+
"src/easyhla/update_frequency_file.py",
119121
"src/scripts/*.py",
120122
]
121123

src/easyhla/models.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -133,46 +133,46 @@ def __lt__(self, other: "HLAProteinPair") -> bool:
133133
)
134134
return me_tuple < other_tuple
135135

136-
UNKNOWN: ClassVar[Final[str]] = "unknown"
136+
UNMAPPED: ClassVar[Final[str]] = "unmapped"
137137
DEPRECATED: ClassVar[Final[str]] = "deprecated"
138138

139139
class NonAlleleException(Exception):
140140
def __init__(
141141
self,
142-
first_unknown: bool = False,
142+
first_unmapped: bool = False,
143143
first_deprecated: bool = False,
144-
second_unknown: bool = False,
144+
second_unmapped: bool = False,
145145
second_deprecated: bool = False,
146146
):
147-
self.first_unknown = first_unknown
147+
self.first_unmapped = first_unmapped
148148
self.first_deprecated = first_deprecated
149-
self.second_unknown = second_unknown
149+
self.second_unmapped = second_unmapped
150150
self.second_deprecated = second_deprecated
151151

152152
@classmethod
153153
def from_frequency_entry(
154154
cls, raw_first_allele: str, raw_second_allele: str
155155
) -> Optional[Self]:
156-
first_unknown: bool = False
156+
first_unmapped: bool = False
157157
first_deprecated: bool = False
158-
second_unknown: bool = False
158+
second_unmapped: bool = False
159159
second_deprecated: bool = False
160160

161-
if raw_first_allele == HLAProteinPair.UNKNOWN:
162-
first_unknown = True
161+
if raw_first_allele == HLAProteinPair.UNMAPPED:
162+
first_unmapped = True
163163
elif raw_first_allele == HLAProteinPair.DEPRECATED:
164164
first_deprecated = True
165165

166-
if raw_second_allele == HLAProteinPair.UNKNOWN:
167-
second_unknown = True
166+
if raw_second_allele == HLAProteinPair.UNMAPPED:
167+
second_unmapped = True
168168
elif raw_second_allele == HLAProteinPair.DEPRECATED:
169169
second_deprecated = True
170170

171171
if any(
172-
(first_unknown, first_deprecated, second_unknown, second_deprecated)
172+
(first_unmapped, first_deprecated, second_unmapped, second_deprecated)
173173
):
174174
return cls(
175-
first_unknown, first_deprecated, second_unknown, second_deprecated
175+
first_unmapped, first_deprecated, second_unmapped, second_deprecated
176176
)
177177
return None
178178

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#! /usr/bin/env python
2+
3+
import argparse
4+
import csv
5+
import logging
6+
from collections import Counter
7+
8+
from .update_frequency_file_lib import (
9+
FREQUENCY_FIELDS,
10+
FrequencyRowDict,
11+
NewName,
12+
OldName,
13+
parse_nomenclature,
14+
update_old_frequencies,
15+
)
16+
from .utils import HLA_LOCUS
17+
18+
logging.basicConfig()
19+
logger: logging.Logger = logging.getLogger(__name__)
20+
logger.setLevel(logging.INFO)
21+
22+
23+
def _log_allele_counts(
    header: str, counts: "Counter[tuple[HLA_LOCUS, str]]"
) -> None:
    """Log ``header`` followed by one line per (locus, allele name) tally.

    Nothing is logged when ``counts`` is empty, so the header never appears
    without at least one entry beneath it.
    """
    if not counts:
        return

    logger.info(header)
    for (locus, name), occurrences in counts.items():
        logger.info('HLA-%s allele "%s": %s', locus, name, occurrences)


def main():
    """Command-line entry point: rewrite an old HLA frequencies file.

    Reads the old-name to new-name mapping, logs what the mapping parser
    reported about deprecated alleles, converts the old frequency file with
    ``update_old_frequencies``, logs per-allele tallies of unmapped and
    deprecated alleles encountered, and writes the updated rows (with a
    header) to the output CSV.
    """
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would make this sentence
    # show up as the program name in usage/error messages.
    parser: argparse.ArgumentParser = argparse.ArgumentParser(
        description="Update the format of the old HLA frequencies file"
    )
    parser.add_argument(
        "old_frequency_file",
        help="Old frequency file path (6-column CSV without header)",
        type=argparse.FileType("r"),
    )
    parser.add_argument(
        "name_mapping",
        help=(
            "File mapping old allele names to new allele names in the format "
            "kept by IMGT/HLA"
        ),
        type=argparse.FileType("r"),
    )
    # NOTE(review): argparse applies ``type`` to string defaults, so the
    # output file is opened for writing as soon as arguments are parsed.
    parser.add_argument(
        "--output",
        help="Output CSV file",
        type=argparse.FileType("w"),
        default="hla_frequencies.csv",
    )
    args: argparse.Namespace = parser.parse_args()

    logger.info("Reading in the update mapping....")
    with args.name_mapping:
        old_to_new: dict[OldName, NewName]
        deprecated: list[str]
        deprecated_maps_to_other: list[tuple[str, str]]
        mapping_overrides_deprecated: list[tuple[str, str]]
        (
            old_to_new,
            deprecated,
            deprecated_maps_to_other,
            mapping_overrides_deprecated,
        ) = parse_nomenclature(args.name_mapping.read())

    for old_name_str in deprecated:
        logger.info("Allele %s is deprecated.", old_name_str)

    for old_name_str, existing_mapping in deprecated_maps_to_other:
        logger.info(
            "Allele %s is deprecated but another allele "
            "with the same first two coordinates maps to "
            "%s; retaining that mapping.",
            old_name_str,
            existing_mapping,
        )

    for old_name_str, new_name_str in mapping_overrides_deprecated:
        logger.info(
            "Allele %s maps to %s but another "
            "allele with the same first two coordinates was "
            "deprecated; updating the mapping.",
            old_name_str,
            new_name_str,
        )

    logger.info("Updating old alleles....")
    with args.old_frequency_file:
        updated_frequencies: list[FrequencyRowDict]
        unmapped_alleles: Counter[tuple[HLA_LOCUS, str]]
        deprecated_alleles_seen: Counter[tuple[HLA_LOCUS, str]]
        updated_frequencies, unmapped_alleles, deprecated_alleles_seen = (
            update_old_frequencies(args.old_frequency_file, old_to_new)
        )

    # Report whenever at least one allele was recorded; the previous
    # ``len(...) > 1`` check silently dropped the report when exactly one
    # distinct allele was affected.
    _log_allele_counts(
        "Alleles present in the old frequencies that do not have a mapping "
        "in the new nomenclature, and their numbers of occurrences:",
        unmapped_alleles,
    )
    _log_allele_counts(
        "Alleles present in the old frequencies that are deprecated "
        "in the new nomenclature, and their numbers of occurrences:",
        deprecated_alleles_seen,
    )

    with args.output:
        frequencies_csv: csv.DictWriter = csv.DictWriter(args.output, FREQUENCY_FIELDS)
        frequencies_csv.writeheader()
        frequencies_csv.writerows(updated_frequencies)

    logger.info("... done.")
113+
114+
115+
if __name__ == "__main__":
116+
main()

0 commit comments

Comments
 (0)