Cleaned up all mypy errors.

rhliang · rhliang · commit 2896c8dae38c · 2025-08-06T16:38:08.000-07:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,8 @@ dependencies = [
   "pyyaml>=6.0.2",
   "requests>=2.32.3",
   "typer>=0.15.2",
+  "types-pyyaml>=6.0.12.20250516",
+  "types-requests>=2.32.4.20250611",
 ]
 
 [dependency-groups]
diff --git a/src/easyhla/bblab.py b/src/easyhla/bblab.py
@@ -6,7 +6,7 @@
 from typing import Any, Optional
 
 import typer
-from Bio.Seq import Seq
+from Bio.Seq import MutableSeq, Seq
 from Bio.SeqIO import parse
 
 from .bblab_lib import (
@@ -51,7 +51,7 @@ def log_and_print(
 
 
 def report_unmatched_sequences(
-    unmatched: dict[EXON_NAME, dict[str, Seq]],
+    unmatched: dict[EXON_NAME, dict[str, Seq | MutableSeq | None]],
     to_stdout: bool = False,
 ) -> None:
     """
@@ -97,7 +97,7 @@ def process_from_file_to_files(
     )
 
     matched_sequences: list[HLASequence]
-    unmatched: dict[EXON_NAME, dict[str, Seq]]
+    unmatched: dict[EXON_NAME, dict[str, Seq | MutableSeq | None]]
 
     with open(filename, "r", encoding="utf-8") as f:
         matched_sequences, unmatched = pair_exons(
diff --git a/src/easyhla/bblab_lib.py b/src/easyhla/bblab_lib.py
@@ -3,7 +3,7 @@
 from typing import TypedDict
 
 import numpy as np
-from Bio.Seq import Seq
+from Bio.Seq import MutableSeq, Seq
 from Bio.SeqIO import SeqRecord
 from pydantic import BaseModel
 
@@ -36,7 +36,7 @@
 
 def pair_exons_helper(
     sequence_record: SeqRecord,
-    unmatched: dict[EXON_NAME, dict[str, Seq]],
+    unmatched: dict[EXON_NAME, dict[str, Seq | MutableSeq | None]],
 ) -> tuple[str, bool, bool, str, str]:
     """
     Helper that attempts to match the given sequence with a "partner" exon.
@@ -55,7 +55,7 @@ def pair_exons_helper(
     - exon3 sequence
     """
     # The `id`` field is expected to hold the sample name.
-    samp: str = sequence_record.id
+    samp: str = sequence_record.id or ""
     is_exon: bool = False
     matched: bool = False
     exon2: str = ""
@@ -98,7 +98,7 @@ def pair_exons(
     sequence_records: Iterable[SeqRecord],
     locus: HLA_LOCUS,
     example_standard: HLAStandard,
-) -> tuple[list[HLASequence], dict[EXON_NAME, dict[str, Seq]]]:
+) -> tuple[list[HLASequence], dict[EXON_NAME, dict[str, Seq | MutableSeq | None]]]:
     """
     Pair exons in the given input sequences.
 
@@ -109,7 +109,7 @@ def pair_exons(
     sequences and attempt to match them up.
     """
     matched_sequences: list[HLASequence] = []
-    unmatched: dict[EXON_NAME, dict[str, Seq]] = {
+    unmatched: dict[EXON_NAME, dict[str, Seq | MutableSeq | None]] = {
         "exon2": {},
         "exon3": {},
     }
@@ -118,7 +118,7 @@ def pair_exons(
         # Skip over any sequences that aren't the right length or contain
         # bad bases.
         try:
-            check_length(locus, str(sr.seq), sr.id)
+            check_length(locus, str(sr.seq), sr.id or "")
         except BadLengthException:
             continue
 
@@ -147,21 +147,21 @@ def pair_exons(
             exon3_bin = pad_short(example_standard.sequence, nuc2bin(exon3), "exon3")
             matched_sequences.append(
                 HLASequence(
-                    two=(int(x) for x in exon2_bin),
+                    two=tuple(int(x) for x in exon2_bin),
                     intron=(),
-                    three=(int(x) for x in exon3_bin),
+                    three=tuple(int(x) for x in exon3_bin),
                     name=identifier,
                     locus=locus,
                     num_sequences_used=2,
                 )
             )
         else:
-            seq_numpy: np.array = pad_short(
+            seq_numpy: np.ndarray = pad_short(
                 example_standard.sequence,
                 nuc2bin(sr.seq),  # type: ignore
                 None,
             )
-            seq: tuple[int] = tuple(int(x) for x in seq_numpy)
+            seq: tuple[int, ...] = tuple(int(x) for x in seq_numpy)
             matched_sequences.append(
                 HLASequence(
                     two=seq[:EXON2_LENGTH],
diff --git a/src/easyhla/clinical_hla_lib.py b/src/easyhla/clinical_hla_lib.py
@@ -348,7 +348,7 @@ def identify_bc_sequence_files(
         if sample_match is None:
             logger.info(f'Skipping file "{filename}".')
             continue
-        sample_name: str = sample_match.group(1)
+        sample_name = sample_match.group(1)
         sample_exon: EXON_NAME = (
             "exon2" if sample_match.group(2).upper() == "A" else "exon3"
         )
diff --git a/src/easyhla/easyhla.py b/src/easyhla/easyhla.py
@@ -4,7 +4,7 @@
 from datetime import datetime
 from io import TextIOBase
 from operator import attrgetter
-from typing import Final, Optional, TypedDict
+from typing import Final, Optional, TypedDict, cast
 
 import numpy as np
 import yaml
@@ -127,7 +127,7 @@ def read_hla_standards(standards_io: TextIOBase) -> LoadedStandards:
         }
 
     @staticmethod
-    def load_default_hla_standards() -> dict[str, HLAStandard]:
+    def load_default_hla_standards() -> LoadedStandards:
         """
         Load HLA Standards from reference file.
 
@@ -258,7 +258,7 @@ def combine_standards_stepper(
         # Keep track of matches we've already found:
         combos: dict[tuple[int, ...], int] = {}
 
-        current_rejection_threshold: int = float("inf")
+        current_rejection_threshold: int | float = float("inf")
         for std_ai, std_a in enumerate(matching_stds):
             if std_a.mismatch > current_rejection_threshold:
                 continue
@@ -269,8 +269,8 @@ def combine_standards_stepper(
                 # "Mush" the two standards together to produce something
                 # that looks like what you get when you sequence HLA.
                 std_bin = np.array(std_b.sequence) | np.array(std_a.sequence)
-                allele_pair: tuple[str, str] = tuple(
-                    sorted((std_a.allele, std_b.allele))
+                allele_pair: tuple[str, str] = cast(
+                    tuple[str, str], tuple(sorted((std_a.allele, std_b.allele)))
                 )
 
                 # There could be more than one combined standard with the
@@ -284,7 +284,7 @@ def combine_standards_stepper(
                 else:
                     seq_mask = np.full_like(std_bin, fill_value=15)
                     # Note that seq is implicitly cast to a NumPy array:
-                    mismatches: int = np.count_nonzero((std_bin ^ seq) & seq_mask != 0)
+                    mismatches = np.count_nonzero((std_bin ^ seq) & seq_mask != 0)
                     combos[combined_std_bin] = mismatches  # cache this value
 
                 if mismatches > current_rejection_threshold:
@@ -330,7 +330,7 @@ def combine_standards(
 
         combos: dict[tuple[int, ...], tuple[int, list[tuple[str, str]]]] = {}
 
-        fewest_mismatches: int = float("inf")
+        fewest_mismatches: int | float = float("inf")
         for (
             combined_std_bin,
             mismatches,
@@ -346,7 +346,7 @@ def combine_standards(
         # criteria.
         result: dict[HLACombinedStandard, int] = {}
 
-        cutoff: int = max(fewest_mismatches, mismatch_threshold)
+        cutoff: int | float = max(fewest_mismatches, mismatch_threshold)
         for combined_std_bin, mismatch_count_and_pair_list in combos.items():
             mismatch_count: int
             pair_list: list[tuple[str, str]]
diff --git a/src/easyhla/interpret_from_json_lib.py b/src/easyhla/interpret_from_json_lib.py
@@ -68,7 +68,7 @@ def hla_sequence(self) -> HLASequence:
             exon3_str = self.seq1[-276:]
         else:
             exon2_str = self.seq1
-            exon3_str = self.seq2
+            exon3_str = self.seq2 or ""
 
         num_sequences_used: int = 1 if self.locus == "A" else 2
         return HLASequence(
diff --git a/src/easyhla/update_alleles.py b/src/easyhla/update_alleles.py
@@ -8,7 +8,7 @@
 import time
 from datetime import datetime
 from io import StringIO
-from typing import Final, Optional, TypedDict
+from typing import Final, Optional, TypedDict, cast
 
 import Bio
 import requests
@@ -169,7 +169,7 @@ def get_commit_hash(
     return None
 
 
-def get_from_git(tag: str) -> tuple[str, datetime, str]:
+def get_from_git(tag: str) -> tuple[str, datetime, Optional[str]]:
     alleles_str: str
     retrieval_datetime: datetime
     for i in range(5):
@@ -185,7 +185,7 @@ def get_from_git(tag: str) -> tuple[str, datetime, str]:
         else:
             break
 
-    commit_hash: str
+    commit_hash: Optional[str]
     for i in range(5):
         try:
             commit_hash = get_commit_hash(tag)
@@ -271,7 +271,7 @@ def main():
     logger.info(f"Retrieving alleles from tag {args.tag}....")
     alleles_str: str
     retrieval_datetime: datetime
-    commit_hash: str
+    commit_hash: Optional[str]
     alleles_str, retrieval_datetime, commit_hash = get_from_git(args.tag)
     logger.info(
         f"Alleles (version {args.tag}, commit hash {commit_hash}) retrieved at "
@@ -301,10 +301,12 @@ def main():
     logger.info("Identifying identical HLA alleles....")
     standards_for_saving: StoredHLAStandards = StoredHLAStandards(
         tag=args.tag,
-        commit_hash=commit_hash,
+        commit_hash=commit_hash or "",
         last_updated=retrieval_datetime,
         standards={
-            locus: group_identical_alleles(raw_standards[locus])
+            cast(HLA_LOCUS, locus): group_identical_alleles(
+                raw_standards[cast(HLA_LOCUS, locus)]
+            )
             for locus in ("A", "B", "C")
         },
     )
diff --git a/src/easyhla/utils.py b/src/easyhla/utils.py
@@ -4,7 +4,7 @@
 from collections import defaultdict
 from collections.abc import Iterable, Sequence
 from datetime import datetime
-from typing import Final, Literal, Optional, Self
+from typing import Final, Literal, Optional, Self, cast
 
 import numpy as np
 from Bio.SeqIO import SeqRecord
@@ -272,21 +272,21 @@ def pad_short(
     exon3_std_bin: np.ndarray = np.array(std_bin[-EXON3_LENGTH:])
     if exon == "exon2":
         left_pad, right_pad = calc_padding(
-            exon2_std_bin,
+            cast(Sequence[int], exon2_std_bin),
             seq_bin,
         )
     elif exon == "exon3":
         left_pad, right_pad = calc_padding(
-            exon3_std_bin,
+            cast(Sequence[int], exon3_std_bin),
             seq_bin,
         )
     else:  # i.e. this is a full sequence possibly with intron
         left_pad, _ = calc_padding(
-            exon2_std_bin,
+            cast(Sequence[int], exon2_std_bin),
             seq_bin[: int(EXON2_LENGTH / 2)],
         )
         _, right_pad = calc_padding(
-            exon3_std_bin,
+            cast(Sequence[int], exon3_std_bin),
             seq_bin[-int(EXON3_LENGTH / 2) :],
         )
     return np.concatenate(
@@ -300,7 +300,7 @@ def pad_short(
 
 def get_acceptable_match(
     sequence: str, reference: str, mismatch_threshold: int = 20
-) -> tuple[int, Optional[str]]:
+) -> tuple[int, str]:
     """
     Get an "acceptable match" between the sequence and reference.
 
@@ -316,7 +316,7 @@ def get_acceptable_match(
         raise ValueError("sequence must be at least as long as the reference")
 
     score: int = len(reference)
-    best_match: Optional[str] = None
+    best_match: str = sequence[0 : len(reference)]
 
     ref_np: np.ndarray = np.array(list(reference))
     for shift in range(len(sequence) - len(reference) + 1):
@@ -389,32 +389,36 @@ def collate_standards(
     checked to see if it has acceptable matches for both exon2 and exon3.
     """
     output_status_updates: bool = False
+    actual_report_interval: int = 1000
+    actual_logger: logging.Logger
     if logger is not None and report_interval is not None and report_interval > 0:
         output_status_updates = True
+        actual_report_interval = cast(int, report_interval)
+        actual_logger = cast(logging.Logger, logger)
 
     standards: dict[HLA_LOCUS, list[HLARawStandard]] = {
         "A": [],
         "B": [],
         "C": [],
     }
     for idx, allele_sr in enumerate(allele_srs, start=1):
-        if output_status_updates and idx % report_interval == 0:
-            logger.info(f"Processing sequence {idx} of {len(allele_srs)}....")
+        if output_status_updates and idx % actual_report_interval == 0:
+            actual_logger.info(f"Processing sequence {idx} of {len(allele_srs)}....")
 
         # The FASTA headers look like:
         # >HLA:HLA00001 A*01:01:01:01 1098 bp
         allele_name: str = allele_sr.description.split(" ")[1]
-        locus: HLA_LOCUS = allele_name[0]
-
-        if locus not in ("A", "B", "C"):
+        raw_locus: str = allele_name[0]
+        if raw_locus not in ("A", "B", "C"):
             continue
+        locus: HLA_LOCUS = cast(HLA_LOCUS, raw_locus)
 
-        exon2_match: tuple[int, Optional[str]] = get_acceptable_match(
+        exon2_match: tuple[int, str] = get_acceptable_match(
             str(allele_sr.seq),
             exon_references[locus]["exon2"],
             mismatch_threshold=acceptable_match_search_threshold,
         )
-        exon3_match: tuple[int, Optional[str]] = get_acceptable_match(
+        exon3_match: tuple[int, str] = get_acceptable_match(
             str(allele_sr.seq),
             exon_references[locus]["exon3"],
             mismatch_threshold=acceptable_match_search_threshold,
@@ -431,7 +435,7 @@ def collate_standards(
                 )
             )
         elif logger is not None:
-            logger.info(
+            actual_logger.info(
                 f'Rejecting "{allele_name}": {exon2_match[0]} exon2 mismatches,'
                 f" {exon3_match[0]} exon3 mismatches."
             )
@@ -447,7 +451,10 @@ class GroupedAllele(BaseModel):
     exon3: str
     alleles: list[str]
 
-    @computed_field
+    # mypy does not support decorators stacked on top of @property; see
+    # https://github.com/python/mypy/issues/1362
+    # The ignore[misc] marker on the decorator line silences that error.
+    @computed_field  # type: ignore[misc]
     @property
     def name(self) -> str:
         """
diff --git a/tests/bblab_lib_test.py b/tests/bblab_lib_test.py
@@ -1,5 +1,5 @@
 import pytest
-from Bio.Seq import Seq
+from Bio.Seq import MutableSeq, Seq
 from Bio.SeqIO import SeqRecord
 
 from easyhla.bblab_lib import (
@@ -352,7 +352,7 @@
 )
 def test_pair_exons_helper(
     sr: SeqRecord,
-    unmatched: dict[EXON_NAME, dict[str, Seq]],
+    unmatched: dict[EXON_NAME, dict[str, Seq | MutableSeq | None]],
     expected_id: str,
     expected_is_exon: bool,
     expected_matched: bool,
@@ -730,7 +730,7 @@ def test_pair_exons(
     expected_unmatched: dict[EXON_NAME, dict[str, Seq]],
 ):
     paired_seqs: list[HLASequence]
-    unmatched: dict[EXON_NAME, dict[str, Seq]]
+    unmatched: dict[EXON_NAME, dict[str, Seq | MutableSeq | None]]
 
     current_standard: HLARawStandard = HLA_STANDARDS[locus]
     fake_standard: HLAStandard = HLAStandard(
diff --git a/tests/easyhla_test.py b/tests/easyhla_test.py
diff --git a/tests/models_test.py b/tests/models_test.py
diff --git a/tests/utils_test.py b/tests/utils_test.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,8 @@ dependencies = [`
`33`	`33`	`"pyyaml>=6.0.2",`
`34`	`34`	`"requests>=2.32.3",`
`35`	`35`	`"typer>=0.15.2",`
	`36`	`+ "types-pyyaml>=6.0.12.20250516",`
	`37`	`+ "types-requests>=2.32.4.20250611",`
`36`	`38`	`]`
`37`	`39`
`38`	`40`	`[dependency-groups]`
Original file line number	Diff line number	Diff line change
`@@ -348,7 +348,7 @@ def identify_bc_sequence_files(`
`348`	`348`	`if sample_match is None:`
`349`	`349`	`logger.info(f'Skipping file "{filename}".')`
`350`	`350`	`continue`
`351`		`- sample_name: str = sample_match.group(1)`
	`351`	`+ sample_name = sample_match.group(1)`
`352`	`352`	`sample_exon: EXON_NAME = (`
`353`	`353`	`"exon2" if sample_match.group(2).upper() == "A" else "exon3"`
`354`	`354`	`)`