Skip to content

Commit f4ccc0b

Browse files
author
rhliang
committed
Some changes to allow a sensible sorting of allele pairs.
1 parent 1dd6d93 commit f4ccc0b

File tree

4 files changed

+261
-30
lines changed

4 files changed

+261
-30
lines changed

src/easyhla/models.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
HLA_LOCUS,
1111
HLARawStandard,
1212
allele_coordinates,
13+
allele_coordinates_sort_key,
1314
bin2nuc,
1415
count_forgiving_mismatches,
1516
nuc2bin,
@@ -120,16 +121,16 @@ class HLAProteinPair(BaseModel):
120121

121122
def __lt__(self, other: "HLAProteinPair") -> bool:
    """
    Order two protein pairs numerically, field by field.

    Every field is passed through int() before the comparison so that a
    three-digit field such as "110" orders after "15"; comparing the raw
    values as strings would put it before.  Any error raised by int() for
    a field it cannot convert propagates to the caller.

    :param other: the pair to compare this one against
    :return: True if this pair orders strictly before ``other``
    :rtype: bool
    """
    me_tuple: tuple[int, int, int, int] = (
        int(self.first_field_1),
        int(self.first_field_2),
        int(self.second_field_1),
        int(self.second_field_2),
    )
    other_tuple: tuple[int, int, int, int] = (
        int(other.first_field_1),
        int(other.first_field_2),
        int(other.second_field_1),
        int(other.second_field_2),
    )
    # Tuple comparison is lexicographic: the first differing field decides.
    return me_tuple < other_tuple
135136

@@ -383,7 +384,21 @@ def best_common_allele_pair_str(
383384
clean_allele_pair_str: str = " - ".join(clean_allele)
384385
return (clean_allele_pair_str, set(unambiguous_aps.allele_pairs))
385386

386-
def stringify(self, max_length: int = 3900) -> str:
387+
def sort_pairs(self) -> list[tuple[str, str]]:
    """
    Sort the pairs according to "coordinate order".

    Each pair is keyed on the sort key of its first allele and then of its
    second allele, as computed by allele_coordinates_sort_key.  If the
    numeric coordinates tie, the trailing letter(s) of the allele are used
    to attempt to break the tie.

    ``self.allele_pairs`` itself is not reordered; a new list is returned.

    :return: the allele pairs, sorted
    :rtype: list[tuple[str, str]]
    """
    return sorted(
        self.allele_pairs,
        key=lambda pair: (
            allele_coordinates_sort_key(pair[0]),
            allele_coordinates_sort_key(pair[1]),
        ),
    )
400+
401+
def stringify(self, sorted=True, max_length: int = 3900) -> str:
387402
"""
388403
Produce a final outputtable string.
389404
@@ -393,7 +408,10 @@ def stringify(self, max_length: int = 3900) -> str:
393408
:return: ...
394409
:rtype: str
395410
"""
396-
summary_str: str = ";".join([f"{_a[0]} - {_a[1]}" for _a in self.allele_pairs])
411+
allele_pairs: list[tuple[str, str]] = self.allele_pairs
412+
if sorted:
413+
allele_pairs = self.sort_pairs()
414+
summary_str: str = ";".join([f"{_a[0]} - {_a[1]}" for _a in allele_pairs])
397415
if len(summary_str) > max_length:
398416
summary_str = re.sub(
399417
r";[^;]+$",

src/easyhla/utils.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ def allele_coordinates(
342342
Convert an allele string into a list of coordinates.
343343
344344
For example, allele "A*01:23:45N" gets converted to
345-
["A*01", "23", "45N"] or ["01", "23", "45] depending on the value of
345+
["A*01", "23", "45N"] or ["01", "23", "45"] depending on the value of
346346
digits_only.
347347
"""
348348
clean_allele_str: str = allele
@@ -351,14 +351,21 @@ def allele_coordinates(
351351
return clean_allele_str.strip().split(":")
352352

353353

354-
def allele_integer_coordinates(allele: str) -> tuple[int, ...]:
    """
    Convert an allele string into a tuple of integer coordinates.

    For example, allele "A*01:23:45N" gets converted to (1, 23, 45).
    """
    return tuple(int(coord) for coord in allele_coordinates(allele, True))


def allele_coordinates_sort_key(allele: str) -> tuple[tuple[int, ...], str]:
    """
    Produce a sortable key for an allele coordinate string.

    For example, allele "A*01:23:45N" gets converted to ((1, 23, 45), "N"). If
    there's no trailing letter, it becomes ((1, 23, 45), "").

    The integer coordinates come first in the key, so the trailing letters
    only matter when the integer coordinates of two alleles are equal; an
    allele with no trailing letter then sorts before one with a letter,
    because "" compares less than any non-empty string.
    """
    integer_part: tuple[int, ...] = tuple(
        int(coord) for coord in allele_coordinates(allele, True)
    )
    # Capture the run of letters that directly follows the last digits.
    letters_at_end_match: Optional[re.Match] = re.match(
        r".*\d+([a-zA-Z]+)$", allele
    )
    letters_at_end: str = (
        letters_at_end_match.group(1) if letters_at_end_match is not None else ""
    )
    return (integer_part, letters_at_end)
362369

363370

364371
class HLARawStandard(BaseModel):
@@ -430,7 +437,7 @@ def collate_standards(
430437
)
431438

432439
for locus in ("A", "B", "C"):
433-
standards[locus].sort(key=lambda x: allele_integer_coordinates(x.allele))
440+
standards[locus].sort(key=lambda x: allele_coordinates_sort_key(x.allele))
434441

435442
return standards
436443

tests/models_test.py

Lines changed: 172 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,26 @@ class TestHLAProteinPair:
198198
("15", "84", "89", "92"),
199199
("15", "91", "91", "01"),
200200
),
201+
pytest.param(
202+
("15", "84", "89", "92"),
203+
("110", "91", "91", "01"),
204+
id="three_digits_sorted_by_number_and_not_alphabetically_first_first",
205+
),
206+
pytest.param(
207+
("15", "84", "89", "92"),
208+
("15", "111", "91", "01"),
209+
id="three_digits_sorted_by_number_and_not_alphabetically_first_second",
210+
),
211+
pytest.param(
212+
("15", "84", "89", "92"),
213+
("15", "84", "101", "01"),
214+
id="three_digits_sorted_by_number_and_not_alphabetically_second_first",
215+
),
216+
pytest.param(
217+
("15", "84", "89", "92"),
218+
("15", "84", "89", "100"),
219+
id="three_digits_sorted_by_number_and_not_alphabetically_second_second",
220+
),
201221
],
202222
)
203223
def test_strictly_less_than(
@@ -1109,9 +1129,95 @@ def test_get_allele_pairs(
11091129
assert result_alleles.allele_pairs == exp_alleles
11101130

11111131
@pytest.mark.parametrize(
    "raw_allele_pairs, exp_result",
    [
        pytest.param(
            [],
            [],
            id="empty_list",
        ),
        pytest.param(
            [
                ("A*11:01:01G", "A*26:01:01G"),
            ],
            [
                ("A*11:01:01G", "A*26:01:01G"),
            ],
            id="single_element",
        ),
        pytest.param(
            [
                ("A*11:01:01", "A*26:01:01"),
                ("A*12:01:01", "A*26:01:01"),
            ],
            [
                ("A*11:01:01", "A*26:01:01"),
                ("A*12:01:01", "A*26:01:01"),
            ],
            id="two_elements_trivial_sort",
        ),
        pytest.param(
            [
                ("A*12:01:01", "A*26:01:01"),
                ("A*11:01:01", "A*26:01:01"),
            ],
            [
                ("A*11:01:01", "A*26:01:01"),
                ("A*12:01:01", "A*26:01:01"),
            ],
            id="two_elements_nontrivial_sort",
        ),
        # Same numeric coordinates in the first allele: the one without a
        # trailing letter is expected first, regardless of the second allele.
        pytest.param(
            [
                ("A*11:01:01G", "A*25:01:01"),
                ("A*11:01:01", "A*26:01:01"),
            ],
            [
                ("A*11:01:01", "A*26:01:01"),
                ("A*11:01:01G", "A*25:01:01"),
            ],
            id="two_elements_letter_vs_no_letter",
        ),
        # Same numeric coordinates, different trailing letters: "G" < "N".
        pytest.param(
            [
                ("A*11:01:01N", "A*25:01:01"),
                ("A*11:01:01G", "A*26:01:01"),
            ],
            [
                ("A*11:01:01G", "A*26:01:01"),
                ("A*11:01:01N", "A*25:01:01"),
            ],
            id="two_elements_letter_tiebreak",
        ),
        pytest.param(
            [
                ("A*11:01:01G", "A*26:01:01N"),
                ("A*11:01:01G", "A*26:01:01G"),
                ("A*11:01:07", "A*26:01:17"),
                ("A*11:40", "A*66:01G"),
            ],
            [
                ("A*11:01:01G", "A*26:01:01G"),
                ("A*11:01:01G", "A*26:01:01N"),
                ("A*11:01:07", "A*26:01:17"),
                ("A*11:40", "A*66:01G"),
            ],
            id="typical_case",
        ),
    ],
)
def test_sort_pairs(
    self,
    raw_allele_pairs: list[tuple[str, str]],
    exp_result: list[tuple[str, str]],
):
    """Check that AllelePairs.sort_pairs returns the pairs in the expected order."""
    ap: AllelePairs = AllelePairs(allele_pairs=raw_allele_pairs)
    assert ap.sort_pairs() == exp_result
1216+
1217+
@pytest.mark.parametrize(
1218+
"raw_allele_pairs, sorted, max_length, exp_stringification",
1219+
[
1220+
pytest.param(
11151221
[
11161222
("A*02:01:01G", "A*03:01:01G"),
11171223
("A*02:01:52", "A*03:01:03"),
@@ -1128,6 +1234,7 @@ def test_get_allele_pairs(
11281234
("A*02:86", "A*03:123"),
11291235
("A*02:20:01", "A*03:157"),
11301236
],
1237+
False,
11311238
3900,
11321239
# NOTE: This is one string concatenated together
11331240
(
@@ -1146,57 +1253,116 @@ def test_get_allele_pairs(
11461253
"A*02:86 - A*03:123;"
11471254
"A*02:20:01 - A*03:157"
11481255
),
1256+
id="typical_case_no_truncation",
11491257
),
1150-
(
1258+
pytest.param(
11511259
[
11521260
("A*11:01:01G", "A*26:01:01G"),
11531261
("A*11:01:07", "A*26:01:17"),
11541262
("A*11:19", "A*26:13"),
11551263
("A*11:40", "A*66:01G"),
11561264
],
1265+
False,
11571266
3900,
11581267
(
11591268
"A*11:01:01G - A*26:01:01G;"
11601269
"A*11:01:07 - A*26:01:17;"
11611270
"A*11:19 - A*26:13;"
11621271
"A*11:40 - A*66:01G"
11631272
),
1273+
id="no_truncation_no_sorting",
11641274
),
1165-
(
1275+
pytest.param(
11661276
[
11671277
("A*11:01:01G", "A*26:01:01G"),
11681278
("A*11:01:07", "A*26:01:17"),
11691279
("A*11:19", "A*26:13"),
11701280
("A*11:40", "A*66:01G"),
11711281
],
1282+
True,
1283+
3900,
1284+
(
1285+
"A*11:01:01G - A*26:01:01G;"
1286+
"A*11:01:07 - A*26:01:17;"
1287+
"A*11:19 - A*26:13;"
1288+
"A*11:40 - A*66:01G"
1289+
),
1290+
id="no_truncation_trivial_sorting",
1291+
),
1292+
pytest.param(
1293+
[
1294+
("A*11:19", "A*26:13"),
1295+
("A*11:01:01G", "A*26:01:01G"),
1296+
("A*11:40", "A*66:01G"),
1297+
("A*11:01:07", "A*26:01:17"),
1298+
],
1299+
True,
1300+
3900,
1301+
(
1302+
"A*11:01:01G - A*26:01:01G;"
1303+
"A*11:01:07 - A*26:01:17;"
1304+
"A*11:19 - A*26:13;"
1305+
"A*11:40 - A*66:01G"
1306+
),
1307+
id="no_truncation_meaningful_sorting",
1308+
),
1309+
pytest.param(
1310+
[
1311+
("A*11:01:01G", "A*26:01:01N"),
1312+
("A*11:01:01G", "A*26:01:01G"),
1313+
("A*11:01:07", "A*26:01:17"),
1314+
("A*11:40", "A*66:01G"),
1315+
],
1316+
True,
1317+
3900,
1318+
(
1319+
"A*11:01:01G - A*26:01:01G;"
1320+
"A*11:01:01G - A*26:01:01N;"
1321+
"A*11:01:07 - A*26:01:17;"
1322+
"A*11:40 - A*66:01G"
1323+
),
1324+
id="no_truncation_sorting_with_letter_tiebreak",
1325+
),
1326+
pytest.param(
1327+
[
1328+
("A*11:01:01G", "A*26:01:01G"),
1329+
("A*11:01:07", "A*26:01:17"),
1330+
("A*11:19", "A*26:13"),
1331+
("A*11:40", "A*66:01G"),
1332+
],
1333+
False,
11721334
60,
11731335
(
11741336
"A*11:01:01G - A*26:01:01G;"
11751337
"A*11:01:07 - A*26:01:17;"
11761338
"A*11:19 - A*26:13;"
11771339
"...TRUNCATED"
11781340
),
1341+
id="with_truncation",
11791342
),
1180-
(
1343+
pytest.param(
11811344
[
11821345
("A*11:01:01G", "A*26:01:01G"),
11831346
("A*11:01:07", "A*26:01:17"),
11841347
("A*11:19", "A*26:13"),
11851348
("A*11:40", "A*66:01G"),
11861349
],
1350+
False,
11871351
25,
11881352
"A*11:01:01G - A*26:01:01G;...TRUNCATED",
1353+
id="with_strong_truncation",
11891354
),
11901355
],
11911356
)
11921357
def test_stringify(
11931358
self,
11941359
raw_allele_pairs: list[tuple[str, str]],
1360+
sorted: bool,
11951361
max_length: int,
11961362
exp_stringification: str,
11971363
):
11981364
ap: AllelePairs = AllelePairs(allele_pairs=raw_allele_pairs)
1199-
assert ap.stringify(max_length) == exp_stringification
1365+
assert ap.stringify(sorted, max_length) == exp_stringification
12001366

12011367
@pytest.mark.parametrize(
12021368
"raw_allele_pairs, allele_name, expected_result",

0 commit comments

Comments
 (0)