From 27c03490efe4f8fea844ebd37d3c146b44a574df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Wed, 15 Oct 2025 11:58:56 +0100 Subject: [PATCH 01/51] feat: Airflow task to generate chromosome-map.json --- .../load/impc_spa/impc_chromosome_mapper.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py b/impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py new file mode 100644 index 00000000..efb58404 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py @@ -0,0 +1,36 @@ +import logging +import textwrap +from airflow.sdk import Variable, asset +from impc_etl.utils.airflow import create_input_asset, create_output_asset +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_summary_service_json_asset = create_input_asset("output/impc_web_api/gene_summary_service_json") +chromosome_map_json_asset = create_output_asset("impc_spa/chromosome-map.json") + +@asset.multi( + schedule=[gene_summary_service_json_asset], + outlets=[chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_chromosome_mapper", + description=textwrap.dedent( + """IMPC SPA chromosome mapper DAG.""" + ), + tags=["impc_spa", "chromosome map"], +) +@with_spark_session +def impc_chromosome_mapper(): + import json + from pyspark.sql import SparkSession + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + gene_summary_service_json_path = gene_summary_service_json_asset.uri + gene_summary_df = spark.read.json(gene_summary_service_json_path) + gene_summary_df = gene_summary_df.select("mgiGeneAccessionId", "chrName") + gene_list = map(lambda row: row.asDict(), gene_summary_df.collect()) + chromosome_map_dict = {gene["mgiGeneAccessionId"]: gene["chrName"] for gene in gene_list} + 
output_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + with open(output_path, "w") as output_file: + output_file.write(json.dumps(chromosome_map_dict)) \ No newline at end of file From fbcfe0fe7a9f8593a6376f1eea1c2bdea38a7889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Wed, 15 Oct 2025 16:30:27 +0100 Subject: [PATCH 02/51] feat: initial task to generate chromosome dirs and gene summaries --- .../impc_spa/impc_gene_summaries_mapper.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py new file mode 100644 index 00000000..1fb00e23 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py @@ -0,0 +1,55 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, Asset, Metadata, dag +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_summary_service_json_asset = create_input_asset("output/impc_web_api/gene_summary_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_summaries_asset = AssetAlias("impc_spa_gene_summaries") + +@dag( + schedule=[gene_summary_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_summaries_mapper", + description=textwrap.dedent( + """IMPC SPA gene summaries mapper DAG.""" + ), + tags=["impc_spa", "gene", "summary"], +) +def impc_gene_summaries_mapper(): + @with_spark_session + @task(outlets=[gene_summaries_asset]) + def process_gene_summaries(*, outlet_events): + import os + from pyspark.sql import SparkSession + from urllib.parse import unquote, urlparse + + spark 
= SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + gene_summary_service_json_path = gene_summary_service_json_asset.uri + gene_summary_df = spark.read.json(gene_summary_service_json_path) + gene_list = map(lambda row: row.asDict(), gene_summary_df.collect()) + output_path = f"{get_data_release_work_dir()}/output/impc_spa/" + for gene in gene_list: + chromosome = chromosome_map_json[gene["mgiGeneAccessionId"]] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_accession_id = gene["mgiGeneAccessionId"].replace(":", "_") + gene_dir_path = f"{chromosome_folder}/{gene_accession_id}" + os.makedirs(gene_dir_path, exist_ok=True) + + + gene_summary_path = f"{gene_dir_path}/gene-summary.json" + with open(gene_summary_path, "w") as gene_file: + gene_file.write(json.dumps(gene)) + outlet_events[gene_summaries_asset].add(Asset(f"file://{gene_summary_path}")) + process_gene_summaries() +impc_gene_summaries_mapper() From 80cd700c4d37b8e6868851bae64b876cfa34bbbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 23 Oct 2025 13:53:54 +0100 Subject: [PATCH 03/51] feat: initial task to generate external-links files based on chromosome map file --- .../impc_gene_external_links_mapper.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py new file mode 100644 index 00000000..1e1ba86f --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py @@ -0,0 +1,89 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import 
create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +external_links_json_asset = create_input_asset("output/impc_web_api/external_links_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_extenal_links_asset = AssetAlias("impc_spa_external_links") + +@dag( + schedule=[external_links_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_external_links_mapper", + description=textwrap.dedent( + """IMPC SPA external links mapper DAG.""" + ), + tags=["impc_spa", "external links"], +) +def impc_gene_external_links_mapper(): + @with_spark_session + @task + def process_gene_external_links(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + gene_summary_service_json_path = external_links_json_asset.uri + external_links_df = spark.read.json(gene_summary_service_json_path) + result_df = chromosome_map_df.join(external_links_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(result_df.href.isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/external_links_temp_json") + ) + print("Finished") + + @task(outlets=[gene_extenal_links_asset]) + def process_temp_folder(*, outlet_events): + import os 
+ import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + def generate_valid_json_from_file(file_path): + file_data = open(file_path, 'r') + lines = file_data.readlines() + return f"[{','.join(lines)}]" + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/external_links_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + gene_external_links_path = f"{gene_dir_path}/external-links.json" + with open(gene_external_links_path, "w") as gene_file: + gene_file.write(generate_valid_json_from_file(file_path)) + outlet_events[gene_extenal_links_asset].add(Asset(f"file://{gene_external_links_path}")) + + shutil.rmtree(input_path) + print("Finished") + + chain(process_gene_external_links(), process_temp_folder()) +impc_gene_external_links_mapper() From 5d28c4def98f912f78e6427ca364600e49763d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 23 Oct 2025 13:58:45 +0100 Subject: [PATCH 04/51] feat: rename functions --- impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py | 2 +- .../jobs/load/impc_spa/impc_gene_external_links_mapper.py | 4 ++-- impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py 
b/impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py index efb58404..f5adbc12 100644 --- a/impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_chromosome_mapper.py @@ -20,7 +20,7 @@ tags=["impc_spa", "chromosome map"], ) @with_spark_session -def impc_chromosome_mapper(): +def impc_spa_chromosome_mapper(): import json from pyspark.sql import SparkSession from urllib.parse import unquote, urlparse diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py index 1e1ba86f..614a2d92 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py @@ -20,7 +20,7 @@ ), tags=["impc_spa", "external links"], ) -def impc_gene_external_links_mapper(): +def impc_spa_gene_external_links_mapper(): @with_spark_session @task def process_gene_external_links(): @@ -86,4 +86,4 @@ def generate_valid_json_from_file(file_path): print("Finished") chain(process_gene_external_links(), process_temp_folder()) -impc_gene_external_links_mapper() +impc_spa_gene_external_links_mapper() diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py index 1fb00e23..0d4479c8 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_summaries_mapper.py @@ -20,7 +20,7 @@ ), tags=["impc_spa", "gene", "summary"], ) -def impc_gene_summaries_mapper(): +def impc_spa_gene_summaries_mapper(): @with_spark_session @task(outlets=[gene_summaries_asset]) def process_gene_summaries(*, outlet_events): @@ -52,4 +52,4 @@ def process_gene_summaries(*, outlet_events): gene_file.write(json.dumps(gene)) outlet_events[gene_summaries_asset].add(Asset(f"file://{gene_summary_path}")) process_gene_summaries() -impc_gene_summaries_mapper() +impc_spa_gene_summaries_mapper() From 
0d6236efa4da5e79be377901e39f9f46eac4d4fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 23 Oct 2025 14:32:53 +0100 Subject: [PATCH 05/51] chore: add init file to impc_spa dir --- impc_etl/jobs/load/impc_spa/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 impc_etl/jobs/load/impc_spa/__init__.py diff --git a/impc_etl/jobs/load/impc_spa/__init__.py b/impc_etl/jobs/load/impc_spa/__init__.py new file mode 100644 index 00000000..e69de29b From d25f769098b4cee946155eabb08529e6671d460b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 27 Oct 2025 16:18:56 +0000 Subject: [PATCH 06/51] fix: impc_gene_external_links_mapper, repartition by mgi ID to ensure one file per ID --- impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py index 614a2d92..b7b93b47 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py @@ -40,6 +40,7 @@ def process_gene_external_links(): result_df = result_df.drop("chromosome") result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) (result_df + .repartition("mgiGeneAccessionId") .write .option("header", True) .mode("overwrite") From fb7d9be47e4669f549657c15d71c245bc8bcd6ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 10:44:16 +0000 Subject: [PATCH 07/51] feat: generalize functions to partition data and create files --- impc_etl/utils/impc_spa.py | 73 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 impc_etl/utils/impc_spa.py diff --git a/impc_etl/utils/impc_spa.py b/impc_etl/utils/impc_spa.py new file mode 100644 index 00000000..45f12db9 --- /dev/null +++ 
b/impc_etl/utils/impc_spa.py @@ -0,0 +1,73 @@ + +def write_partitioned_data( + spark, + chromosome_map_path: str, + parquet_path: str, + col_to_filter: str, + temp_folder_path: str +): + import json + from pyspark.sql.functions import col, regexp_replace + from urllib.parse import unquote, urlparse + from impc_etl.utils.airflow import get_data_release_work_dir + + + chromosome_map_json_path = unquote(urlparse(chromosome_map_path).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + parquet_df = spark.read.json(parquet_path) + result_df = chromosome_map_df.join(parquet_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(col(col_to_filter).isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .repartition("mgiGeneAccessionId") + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/{temp_folder_path}") + ) + + +def generate_valid_json_from_file(file_path): + file_data = open(file_path, 'r') + lines = file_data.readlines() + return f"[{','.join(lines)}]" + +def process_temp_folder_into_files( + chromosome_map_path: str, + temp_folder_path: str, + asset_alias, + outlet_events +): + from glob import iglob + from urllib.parse import unquote, urlparse + from airflow.sdk import Asset + from impc_etl.utils.airflow import get_data_release_work_dir + import os + import json + import shutil + + chromosome_map_json_path = unquote(urlparse(chromosome_map_path).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/{temp_folder_path}" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in 
iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + file_path = f"{gene_dir_path}/external-links.json" + with open(file_path, "w") as gene_file: + gene_file.write(generate_valid_json_from_file(file_path)) + outlet_events[asset_alias].add(Asset(f"file://{file_path}")) + shutil.rmtree(input_path) \ No newline at end of file From 0027298c93669e2dbc85cdbba6139480ff179a77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 10:44:43 +0000 Subject: [PATCH 08/51] feat: update gene external links task to use new functions --- .../impc_gene_external_links_mapper.py | 74 +++++-------------- 1 file changed, 17 insertions(+), 57 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py index b7b93b47..879f14f3 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py @@ -1,8 +1,8 @@ -import json import logging import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset -from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.airflow import create_input_asset +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files from impc_etl.utils.spark import with_spark_session task_logger = logging.getLogger("airflow.task") @@ -10,7 +10,7 @@ external_links_json_asset = create_input_asset("output/impc_web_api/external_links_json") 
chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") -gene_extenal_links_asset = AssetAlias("impc_spa_external_links") +gene_external_links_asset = AssetAlias("impc_spa_external_links") @dag( schedule=[external_links_json_asset, chromosome_map_json_asset], @@ -25,65 +25,25 @@ def impc_spa_gene_external_links_mapper(): @task def process_gene_external_links(): from pyspark.sql import SparkSession - from pyspark.sql.functions import regexp_replace - from urllib.parse import unquote, urlparse spark = SparkSession.builder.getOrCreate() - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) - gene_summary_service_json_path = external_links_json_asset.uri - external_links_df = spark.read.json(gene_summary_service_json_path) - result_df = chromosome_map_df.join(external_links_df, "mgiGeneAccessionId", "left_outer") - result_df = result_df.filter(result_df.href.isNotNull()) - result_df = result_df.drop("chromosome") - result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) - (result_df - .repartition("mgiGeneAccessionId") - .write - .option("header", True) - .mode("overwrite") - .partitionBy("mgiGeneAccessionId") - .json(f"{get_data_release_work_dir()}/output/impc_spa/external_links_temp_json") - ) + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=external_links_json_asset.uri, + col_to_filter="href", + temp_folder_path="external_links_temp_json" + ) print("Finished") - @task(outlets=[gene_extenal_links_asset]) + @task(outlets=[gene_external_links_asset]) def process_temp_folder(*, outlet_events): - import os - import shutil - from glob import iglob - from urllib.parse import unquote, urlparse - - def 
generate_valid_json_from_file(file_path): - file_data = open(file_path, 'r') - lines = file_data.readlines() - return f"[{','.join(lines)}]" - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - - input_path = f"{get_data_release_work_dir()}/output/impc_spa/external_links_temp_json" - output_path = f"{get_data_release_work_dir()}/output/impc_spa" - for file_path in iglob(f"{input_path}/**/*.json"): - filepath_parts = file_path.split("/") - filepath_parts.pop() - parent_dir = filepath_parts.pop() - mgi_gene_accession_id = parent_dir.split("=")[1] - original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") - - chromosome = chromosome_map_json[original_mgi_gene_accession_id] - chromosome_folder = f"{output_path}/{chromosome}" - os.makedirs(chromosome_folder, exist_ok=True) - - gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - gene_external_links_path = f"{gene_dir_path}/external-links.json" - with open(gene_external_links_path, "w") as gene_file: - gene_file.write(generate_valid_json_from_file(file_path)) - outlet_events[gene_extenal_links_asset].add(Asset(f"file://{gene_external_links_path}")) - - shutil.rmtree(input_path) + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="external_links_temp_json", + asset_alias=gene_external_links_asset, + outlet_events=outlet_events + ) print("Finished") chain(process_gene_external_links(), process_temp_folder()) From ea27b64b76a6503704b4cb2033a5cbb73113dac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 15:01:55 +0000 Subject: [PATCH 09/51] fix: add missing param, update gene external links mapper task --- impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py | 1 + impc_etl/utils/impc_spa.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git 
a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py index 879f14f3..503f8376 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_external_links_mapper.py @@ -41,6 +41,7 @@ def process_temp_folder(*, outlet_events): process_temp_folder_into_files( chromosome_map_path=chromosome_map_json_asset.uri, temp_folder_path="external_links_temp_json", + file_name="external-links.json", asset_alias=gene_external_links_asset, outlet_events=outlet_events ) diff --git a/impc_etl/utils/impc_spa.py b/impc_etl/utils/impc_spa.py index 45f12db9..e7055a53 100644 --- a/impc_etl/utils/impc_spa.py +++ b/impc_etl/utils/impc_spa.py @@ -38,6 +38,7 @@ def generate_valid_json_from_file(file_path): def process_temp_folder_into_files( chromosome_map_path: str, temp_folder_path: str, + file_name: str, asset_alias, outlet_events ): @@ -66,7 +67,7 @@ def process_temp_folder_into_files( os.makedirs(chromosome_folder, exist_ok=True) gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - file_path = f"{gene_dir_path}/external-links.json" + file_path = f"{gene_dir_path}/{file_name}" with open(file_path, "w") as gene_file: gene_file.write(generate_valid_json_from_file(file_path)) outlet_events[asset_alias].add(Asset(f"file://{file_path}")) From e99bf96e43f14a7de5bf04a31fcc83dcf5684ad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Fri, 24 Oct 2025 14:38:03 +0100 Subject: [PATCH 10/51] WIP --- .../load/impc_spa/impc_diseases_mapper.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py b/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py new file mode 100644 index 00000000..529e99d3 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py @@ -0,0 +1,90 @@ +import json 
+import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_diseases_service_json_asset = create_input_asset("output/impc_web_api/gene_diseases_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_diseases_asset = AssetAlias("impc_spa_diseases") + +@dag( + schedule=[gene_diseases_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_diseases_mapper", + description=textwrap.dedent( + """IMPC SPA diseases mapper DAG.""" + ), + tags=["impc_spa", "diseases"], +) +def impc_spa_gene_diseases_mapper(): + @with_spark_session + @task + def process_gene_diseases(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + gene_diseases_service_json_path = gene_diseases_service_json_asset.uri + gene_diseases_df = spark.read.json(gene_diseases_service_json_path) + result_df = chromosome_map_df.join(gene_diseases_df, "mgiGeneAccessionId", "left_outer") + result_df.show() + # result_df = result_df.filter(result_df.href.isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId", "associationCurated") + 
.json(f"{get_data_release_work_dir()}/output/impc_spa/gene_diseases_temp_json") + ) + print("Finished") + + @task(outlets=[gene_diseases_asset]) + def process_temp_folder(*, outlet_events): + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + # def generate_valid_json_from_file(file_path): + # file_data = open(file_path, 'r') + # lines = file_data.readlines() + # return f"[{','.join(lines)}]" + # + # chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + # chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + # + # input_path = f"{get_data_release_work_dir()}/output/impc_spa/external_links_temp_json" + # output_path = f"{get_data_release_work_dir()}/output/impc_spa" + # for file_path in iglob(f"{input_path}/**/*.json"): + # filepath_parts = file_path.split("/") + # filepath_parts.pop() + # parent_dir = filepath_parts.pop() + # mgi_gene_accession_id = parent_dir.split("=")[1] + # original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + # + # chromosome = chromosome_map_json[original_mgi_gene_accession_id] + # chromosome_folder = f"{output_path}/{chromosome}" + # os.makedirs(chromosome_folder, exist_ok=True) + # + # gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + # gene_external_links_path = f"{gene_dir_path}/external-links.json" + # with open(gene_external_links_path, "w") as gene_file: + # gene_file.write(generate_valid_json_from_file(file_path)) + # outlet_events[gene_diseases_asset].add(Asset(f"file://{gene_external_links_path}")) + # + # shutil.rmtree(input_path) + print("Finished") + + chain(process_gene_diseases(), process_temp_folder()) +impc_spa_gene_diseases_mapper() From 36f820456adebff5900469c87abb99007904a843 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 27 Oct 2025 16:13:36 +0000 Subject: [PATCH 11/51] feat: initial Airflow task to generate gene publications.json files --- 
.../impc_spa/impc_gene_publications_mapper.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py new file mode 100644 index 00000000..316c8af2 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py @@ -0,0 +1,92 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +publications_service_json_asset = create_input_asset("output/impc_web_api/publications_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_publications_asset = AssetAlias("impc_spa_gene_publications") + +@dag( + schedule=[publications_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_publications_mapper", + description=textwrap.dedent( + """IMPC SPA gene publications mapper DAG.""" + ), + tags=["impc_spa", "gene", "publications"], +) +def impc_spa_gene_publications_mapper(): + @with_spark_session + @task + def process_gene_publications(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import explode, regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + publications_service_json_path = publications_service_json_asset.uri + publications_df = 
spark.read.json(publications_service_json_path) + publications_df = publications_df.withColumn("allele_fields", explode("alleles")) + publications_df = publications_df.select("*", "allele_fields.*") + result_df = chromosome_map_df.join(publications_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(result_df.alleles.isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .repartition("mgiGeneAccessionId") + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/publications_temp_json") + ) + print("Finished") + + @task(outlets=[gene_publications_asset]) + def process_temp_folder(*, outlet_events): + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + def generate_valid_json_from_file(file_path): + file_data = open(file_path, 'r') + lines = file_data.readlines() + return f"[{','.join(lines)}]" + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/publications_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + gene_publications_path = f"{gene_dir_path}/publications.json" + with 
open(gene_publications_path, "w") as gene_file: + gene_file.write(generate_valid_json_from_file(file_path)) + outlet_events[gene_publications_asset].add(Asset(f"file://{gene_publications_path}")) + + shutil.rmtree(input_path) + print("Finished") + + chain(process_gene_publications(), process_temp_folder()) +impc_spa_gene_publications_mapper() From a852a0eb19470f0dbaf2f3e4fbb16d6c94465c1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 15:12:02 +0000 Subject: [PATCH 12/51] feat: update gene publications mapper to use utils function --- .../impc_spa/impc_gene_publications_mapper.py | 41 ++++--------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py index 316c8af2..ef552575 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py @@ -3,6 +3,7 @@ import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.impc_spa import process_temp_folder_into_files from impc_etl.utils.spark import with_spark_session task_logger = logging.getLogger("airflow.task") @@ -53,39 +54,13 @@ def process_gene_publications(): @task(outlets=[gene_publications_asset]) def process_temp_folder(*, outlet_events): - import os - import shutil - from glob import iglob - from urllib.parse import unquote, urlparse - - def generate_valid_json_from_file(file_path): - file_data = open(file_path, 'r') - lines = file_data.readlines() - return f"[{','.join(lines)}]" - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - - input_path = f"{get_data_release_work_dir()}/output/impc_spa/publications_temp_json" - 
output_path = f"{get_data_release_work_dir()}/output/impc_spa" - for file_path in iglob(f"{input_path}/**/*.json"): - filepath_parts = file_path.split("/") - filepath_parts.pop() - parent_dir = filepath_parts.pop() - mgi_gene_accession_id = parent_dir.split("=")[1] - original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") - - chromosome = chromosome_map_json[original_mgi_gene_accession_id] - chromosome_folder = f"{output_path}/{chromosome}" - os.makedirs(chromosome_folder, exist_ok=True) - - gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - gene_publications_path = f"{gene_dir_path}/publications.json" - with open(gene_publications_path, "w") as gene_file: - gene_file.write(generate_valid_json_from_file(file_path)) - outlet_events[gene_publications_asset].add(Asset(f"file://{gene_publications_path}")) - - shutil.rmtree(input_path) + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="publications_temp_json", + file_name="publications.json", + asset_alias=gene_publications_asset, + outlet_events=outlet_events + ) print("Finished") chain(process_gene_publications(), process_temp_folder()) From 695fa2d271e60bdec42b118fe814af2d8017beba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Fri, 24 Oct 2025 14:38:03 +0100 Subject: [PATCH 13/51] WIP --- .../load/impc_spa/impc_diseases_mapper.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py b/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py new file mode 100644 index 00000000..529e99d3 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py @@ -0,0 +1,90 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from 
impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_diseases_service_json_asset = create_input_asset("output/impc_web_api/gene_diseases_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_diseases_asset = AssetAlias("impc_spa_diseases") + +@dag( + schedule=[gene_diseases_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_diseases_mapper", + description=textwrap.dedent( + """IMPC SPA diseases mapper DAG.""" + ), + tags=["impc_spa", "diseases"], +) +def impc_spa_gene_diseases_mapper(): + @with_spark_session + @task + def process_gene_diseases(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + gene_diseases_service_json_path = gene_diseases_service_json_asset.uri + gene_diseases_df = spark.read.json(gene_diseases_service_json_path) + result_df = chromosome_map_df.join(gene_diseases_df, "mgiGeneAccessionId", "left_outer") + result_df.show() + # result_df = result_df.filter(result_df.href.isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId", "associationCurated") + .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_diseases_temp_json") + ) + print("Finished") + + @task(outlets=[gene_diseases_asset]) + def process_temp_folder(*, outlet_events): + import os + import shutil + from 
glob import iglob + from urllib.parse import unquote, urlparse + + # def generate_valid_json_from_file(file_path): + # file_data = open(file_path, 'r') + # lines = file_data.readlines() + # return f"[{','.join(lines)}]" + # + # chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + # chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + # + # input_path = f"{get_data_release_work_dir()}/output/impc_spa/external_links_temp_json" + # output_path = f"{get_data_release_work_dir()}/output/impc_spa" + # for file_path in iglob(f"{input_path}/**/*.json"): + # filepath_parts = file_path.split("/") + # filepath_parts.pop() + # parent_dir = filepath_parts.pop() + # mgi_gene_accession_id = parent_dir.split("=")[1] + # original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + # + # chromosome = chromosome_map_json[original_mgi_gene_accession_id] + # chromosome_folder = f"{output_path}/{chromosome}" + # os.makedirs(chromosome_folder, exist_ok=True) + # + # gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + # gene_external_links_path = f"{gene_dir_path}/external-links.json" + # with open(gene_external_links_path, "w") as gene_file: + # gene_file.write(generate_valid_json_from_file(file_path)) + # outlet_events[gene_diseases_asset].add(Asset(f"file://{gene_external_links_path}")) + # + # shutil.rmtree(input_path) + print("Finished") + + chain(process_gene_diseases(), process_temp_folder()) +impc_spa_gene_diseases_mapper() From 66eaf3c1ae28feef3805b68767d0d467e529150a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Fri, 24 Oct 2025 14:45:11 +0100 Subject: [PATCH 14/51] feat: initial task to create gene-histopathology json files --- .../impc_gene_histopathology_mapper.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py diff --git 
a/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py new file mode 100644 index 00000000..bbff8b5a --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py @@ -0,0 +1,89 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_histopathology_service_json_asset = create_input_asset("output/impc_web_api/gene_histopathology_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_histopathology_asset = AssetAlias("impc_spa_gene_histopathology") + +@dag( + schedule=[gene_histopathology_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_histopathology_mapper", + description=textwrap.dedent( + """IMPC SPA gene histopathology mapper DAG.""" + ), + tags=["impc_spa", "gene", "histopathology"], +) +def impc_spa_gene_histopathology_mapper(): + @with_spark_session + @task + def process_gene_histopathology(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + gene_histopathology_service_json_path = gene_histopathology_service_json_asset.uri + gene_histopathology_df = spark.read.json(gene_histopathology_service_json_path) + result_df = chromosome_map_df.join(gene_histopathology_df, "mgiGeneAccessionId", 
"left_outer") + result_df = result_df.filter(result_df.alleleAccessionId.isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_histopathology_temp_json") + ) + print("Finished") + + @task(outlets=[gene_histopathology_asset]) + def process_temp_folder(*, outlet_events): + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + def generate_valid_json_from_file(file_path): + file_data = open(file_path, 'r') + lines = file_data.readlines() + return f"[{','.join(lines)}]" + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_histopathology_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + gene_histopathology_path = f"{gene_dir_path}/gene-histopathology.json" + with open(gene_histopathology_path, "w") as gene_file: + gene_file.write(generate_valid_json_from_file(file_path)) + outlet_events[gene_histopathology_asset].add(Asset(f"file://{gene_histopathology_path}")) + + shutil.rmtree(input_path) + print("Finished") + + chain(process_gene_histopathology(), 
process_temp_folder()) +impc_spa_gene_histopathology_mapper() From 3a775f3dfbbdcd8eb41e22541ae836a01966481b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 27 Oct 2025 16:19:59 +0000 Subject: [PATCH 15/51] fix: ensure one file per gene --- impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py index bbff8b5a..eb0b9988 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py @@ -40,6 +40,7 @@ def process_gene_histopathology(): result_df = result_df.drop("chromosome") result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) (result_df + .repartition("mgiGeneAccessionId") .write .option("header", True) .mode("overwrite") From 1a8577ea94f0fe0e01b712414210562994bc94cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 15:16:08 +0000 Subject: [PATCH 16/51] fix: remove diseases mapper from this branch --- .../load/impc_spa/impc_diseases_mapper.py | 90 ------------------- 1 file changed, 90 deletions(-) delete mode 100644 impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py b/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py deleted file mode 100644 index 529e99d3..00000000 --- a/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py +++ /dev/null @@ -1,90 +0,0 @@ -import json -import logging -import textwrap -from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset -from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir -from impc_etl.utils.spark import with_spark_session - -task_logger = logging.getLogger("airflow.task") -dr_tag = Variable.get("data_release_tag") - 
-gene_diseases_service_json_asset = create_input_asset("output/impc_web_api/gene_diseases_service_json") -chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") -gene_diseases_asset = AssetAlias("impc_spa_diseases") - -@dag( - schedule=[gene_diseases_service_json_asset, chromosome_map_json_asset], - dag_id=f"{dr_tag}_impc_spa_diseases_mapper", - description=textwrap.dedent( - """IMPC SPA diseases mapper DAG.""" - ), - tags=["impc_spa", "diseases"], -) -def impc_spa_gene_diseases_mapper(): - @with_spark_session - @task - def process_gene_diseases(): - from pyspark.sql import SparkSession - from pyspark.sql.functions import regexp_replace - from urllib.parse import unquote, urlparse - - spark = SparkSession.builder.getOrCreate() - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) - gene_diseases_service_json_path = gene_diseases_service_json_asset.uri - gene_diseases_df = spark.read.json(gene_diseases_service_json_path) - result_df = chromosome_map_df.join(gene_diseases_df, "mgiGeneAccessionId", "left_outer") - result_df.show() - # result_df = result_df.filter(result_df.href.isNotNull()) - result_df = result_df.drop("chromosome") - result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) - (result_df - .write - .option("header", True) - .mode("overwrite") - .partitionBy("mgiGeneAccessionId", "associationCurated") - .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_diseases_temp_json") - ) - print("Finished") - - @task(outlets=[gene_diseases_asset]) - def process_temp_folder(*, outlet_events): - import os - import shutil - from glob import iglob - from urllib.parse import unquote, urlparse - - # def generate_valid_json_from_file(file_path): - # file_data = 
open(file_path, 'r') - # lines = file_data.readlines() - # return f"[{','.join(lines)}]" - # - # chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - # chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - # - # input_path = f"{get_data_release_work_dir()}/output/impc_spa/external_links_temp_json" - # output_path = f"{get_data_release_work_dir()}/output/impc_spa" - # for file_path in iglob(f"{input_path}/**/*.json"): - # filepath_parts = file_path.split("/") - # filepath_parts.pop() - # parent_dir = filepath_parts.pop() - # mgi_gene_accession_id = parent_dir.split("=")[1] - # original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") - # - # chromosome = chromosome_map_json[original_mgi_gene_accession_id] - # chromosome_folder = f"{output_path}/{chromosome}" - # os.makedirs(chromosome_folder, exist_ok=True) - # - # gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - # gene_external_links_path = f"{gene_dir_path}/external-links.json" - # with open(gene_external_links_path, "w") as gene_file: - # gene_file.write(generate_valid_json_from_file(file_path)) - # outlet_events[gene_diseases_asset].add(Asset(f"file://{gene_external_links_path}")) - # - # shutil.rmtree(input_path) - print("Finished") - - chain(process_gene_diseases(), process_temp_folder()) -impc_spa_gene_diseases_mapper() From 9c38c3913cfcd3e3135cdbfede854c99241adf8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 15:19:57 +0000 Subject: [PATCH 17/51] feat: update task to use utils function --- .../impc_gene_histopathology_mapper.py | 71 +++++-------------- 1 file changed, 16 insertions(+), 55 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py index eb0b9988..1c199384 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py +++ 
b/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_mapper.py @@ -1,8 +1,8 @@ -import json import logging import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset -from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.airflow import create_input_asset +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files from impc_etl.utils.spark import with_spark_session task_logger = logging.getLogger("airflow.task") @@ -25,65 +25,26 @@ def impc_spa_gene_histopathology_mapper(): @task def process_gene_histopathology(): from pyspark.sql import SparkSession - from pyspark.sql.functions import regexp_replace - from urllib.parse import unquote, urlparse spark = SparkSession.builder.getOrCreate() - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) - gene_histopathology_service_json_path = gene_histopathology_service_json_asset.uri - gene_histopathology_df = spark.read.json(gene_histopathology_service_json_path) - result_df = chromosome_map_df.join(gene_histopathology_df, "mgiGeneAccessionId", "left_outer") - result_df = result_df.filter(result_df.alleleAccessionId.isNotNull()) - result_df = result_df.drop("chromosome") - result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) - (result_df - .repartition("mgiGeneAccessionId") - .write - .option("header", True) - .mode("overwrite") - .partitionBy("mgiGeneAccessionId") - .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_histopathology_temp_json") - ) + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=gene_histopathology_service_json_asset.uri, + col_to_filter="alleleAccessionId", + 
temp_folder_path="gene_histopathology_temp_json" + ) print("Finished") @task(outlets=[gene_histopathology_asset]) def process_temp_folder(*, outlet_events): - import os - import shutil - from glob import iglob - from urllib.parse import unquote, urlparse - - def generate_valid_json_from_file(file_path): - file_data = open(file_path, 'r') - lines = file_data.readlines() - return f"[{','.join(lines)}]" - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - - input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_histopathology_temp_json" - output_path = f"{get_data_release_work_dir()}/output/impc_spa" - for file_path in iglob(f"{input_path}/**/*.json"): - filepath_parts = file_path.split("/") - filepath_parts.pop() - parent_dir = filepath_parts.pop() - mgi_gene_accession_id = parent_dir.split("=")[1] - original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") - - chromosome = chromosome_map_json[original_mgi_gene_accession_id] - chromosome_folder = f"{output_path}/{chromosome}" - os.makedirs(chromosome_folder, exist_ok=True) - - gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - gene_histopathology_path = f"{gene_dir_path}/gene-histopathology.json" - with open(gene_histopathology_path, "w") as gene_file: - gene_file.write(generate_valid_json_from_file(file_path)) - outlet_events[gene_histopathology_asset].add(Asset(f"file://{gene_histopathology_path}")) - - shutil.rmtree(input_path) + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="gene_histopathology_temp_json", + file_name="gene-histopathology.json", + asset_alias=gene_histopathology_asset, + outlet_events=outlet_events + ) print("Finished") chain(process_gene_histopathology(), process_temp_folder()) From 89626d8ab17b0fe6f6fb99584c15864387789c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: 
Mon, 27 Oct 2025 09:42:11 +0000 Subject: [PATCH 18/51] feat: initial task to create gene images.json file --- .../load/impc_spa/impc_gene_images_mapper.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py new file mode 100644 index 00000000..56fb78f4 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py @@ -0,0 +1,90 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_images_service_json_asset = create_input_asset("output/impc_web_api/gene_images_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_images_asset = AssetAlias("impc_spa_gene_images") + +@dag( + schedule=[gene_images_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_images_mapper", + description=textwrap.dedent( + """IMPC SPA gene images mapper DAG.""" + ), + tags=["impc_spa", "gene", "images"], +) +def impc_spa_gene_images_mapper(): + @with_spark_session + @task + def process_gene_images(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + gene_images_service_json_path = 
gene_images_service_json_asset.uri + gene_images_df = spark.read.json(gene_images_service_json_path) + result_df = chromosome_map_df.join(gene_images_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(result_df.thumbnailUrl.isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .repartition("mgiGeneAccessionId") + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_images_temp_json") + ) + print("Finished") + + @task(outlets=[gene_images_asset]) + def process_temp_folder(*, outlet_events): + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + def generate_valid_json_from_file(file_path): + file_data = open(file_path, 'r') + lines = file_data.readlines() + return f"[{','.join(lines)}]" + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_images_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + gene_images_path = f"{gene_dir_path}/images.json" + with open(gene_images_path, "w") as gene_file: + gene_file.write(generate_valid_json_from_file(file_path)) + 
outlet_events[gene_images_asset].add(Asset(f"file://{gene_images_path}")) + + shutil.rmtree(input_path) + print("Finished") + + chain(process_gene_images(), process_temp_folder()) +impc_spa_gene_images_mapper() From 1b7eeffb1f2ac8d27c5963bd27cd6893d1d23638 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 10:49:33 +0000 Subject: [PATCH 19/51] feat: update task with utils functions --- .../load/impc_spa/impc_gene_images_mapper.py | 67 ++++--------------- 1 file changed, 14 insertions(+), 53 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py index 56fb78f4..2fc770dc 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py @@ -3,6 +3,7 @@ import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files from impc_etl.utils.spark import with_spark_session task_logger = logging.getLogger("airflow.task") @@ -25,65 +26,25 @@ def impc_spa_gene_images_mapper(): @task def process_gene_images(): from pyspark.sql import SparkSession - from pyspark.sql.functions import regexp_replace - from urllib.parse import unquote, urlparse spark = SparkSession.builder.getOrCreate() - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) - gene_images_service_json_path = gene_images_service_json_asset.uri - gene_images_df = spark.read.json(gene_images_service_json_path) - result_df = chromosome_map_df.join(gene_images_df, "mgiGeneAccessionId", "left_outer") - result_df = 
result_df.filter(result_df.thumbnailUrl.isNotNull()) - result_df = result_df.drop("chromosome") - result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) - (result_df - .repartition("mgiGeneAccessionId") - .write - .option("header", True) - .mode("overwrite") - .partitionBy("mgiGeneAccessionId") - .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_images_temp_json") - ) + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=gene_images_service_json_asset.uri, + col_to_filter="thumbnailUrl", + temp_folder_path="gene_images_temp_json" + ) print("Finished") @task(outlets=[gene_images_asset]) def process_temp_folder(*, outlet_events): - import os - import shutil - from glob import iglob - from urllib.parse import unquote, urlparse - - def generate_valid_json_from_file(file_path): - file_data = open(file_path, 'r') - lines = file_data.readlines() - return f"[{','.join(lines)}]" - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - - input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_images_temp_json" - output_path = f"{get_data_release_work_dir()}/output/impc_spa" - for file_path in iglob(f"{input_path}/**/*.json"): - filepath_parts = file_path.split("/") - filepath_parts.pop() - parent_dir = filepath_parts.pop() - mgi_gene_accession_id = parent_dir.split("=")[1] - original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") - - chromosome = chromosome_map_json[original_mgi_gene_accession_id] - chromosome_folder = f"{output_path}/{chromosome}" - os.makedirs(chromosome_folder, exist_ok=True) - - gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - gene_images_path = f"{gene_dir_path}/images.json" - with open(gene_images_path, "w") as gene_file: - gene_file.write(generate_valid_json_from_file(file_path)) - 
outlet_events[gene_images_asset].add(Asset(f"file://{gene_images_path}")) - - shutil.rmtree(input_path) + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="gene_images_temp_json", + asset_alias=gene_images_asset, + outlet_events=outlet_events + ) print("Finished") chain(process_gene_images(), process_temp_folder()) From 857ec79f53989d3598d5cd03487b189de82469be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 15:24:55 +0000 Subject: [PATCH 20/51] fix: add filename --- impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py index 2fc770dc..5e90b5b1 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_images_mapper.py @@ -42,6 +42,7 @@ def process_temp_folder(*, outlet_events): process_temp_folder_into_files( chromosome_map_path=chromosome_map_json_asset.uri, temp_folder_path="gene_images_temp_json", + file_name="images.json", asset_alias=gene_images_asset, outlet_events=outlet_events ) From 76da083a5d41e0efbb71dc974da13bc640ceff88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 27 Oct 2025 10:55:04 +0000 Subject: [PATCH 21/51] feat: initial Airflow task to generate gene expression json files --- .../impc_spa/impc_gene_expression_mapper.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py new file mode 100644 index 00000000..0f633196 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py @@ -0,0 +1,90 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, 
AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_expression_service_json_asset = create_input_asset("output/impc_web_api/gene_expression_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_expression_asset = AssetAlias("impc_spa_gene_expression") + +@dag( + schedule=[gene_expression_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_expression_mapper", + description=textwrap.dedent( + """IMPC SPA gene expression mapper DAG.""" + ), + tags=["impc_spa", "gene", "lac z expression"], +) +def impc_spa_gene_expression_mapper(): + @with_spark_session + @task + def process_gene_expression(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + gene_expression_service_json_path = gene_expression_service_json_asset.uri + gene_expression_df = spark.read.json(gene_expression_service_json_path) + result_df = chromosome_map_df.join(gene_expression_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(result_df.id.isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .repartition("mgiGeneAccessionId") + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + 
.json(f"{get_data_release_work_dir()}/output/impc_spa/gene_expression_temp_json") + ) + print("Finished") + + @task(outlets=[gene_expression_asset]) + def process_temp_folder(*, outlet_events): + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + def generate_valid_json_from_file(file_path): + file_data = open(file_path, 'r') + lines = file_data.readlines() + return f"[{','.join(lines)}]" + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_expression_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + gene_expression_path = f"{gene_dir_path}/expression.json" + with open(gene_expression_path, "w") as gene_file: + gene_file.write(generate_valid_json_from_file(file_path)) + outlet_events[gene_expression_asset].add(Asset(f"file://{gene_expression_path}")) + + shutil.rmtree(input_path) + print("Finished") + + chain(process_gene_expression(), process_temp_folder()) +impc_spa_gene_expression_mapper() From 26581f40acdbc70856f94efcc1fe46232a4615e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 11:05:56 +0000 Subject: [PATCH 22/51] feat: update task to use utils functions --- .../impc_spa/impc_gene_expression_mapper.py | 70 ++++--------------- 1 file changed, 15 insertions(+), 55 
deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py index 0f633196..ac277826 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py @@ -1,8 +1,8 @@ -import json import logging import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset -from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.airflow import create_input_asset +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files from impc_etl.utils.spark import with_spark_session task_logger = logging.getLogger("airflow.task") @@ -25,65 +25,25 @@ def impc_spa_gene_expression_mapper(): @task def process_gene_expression(): from pyspark.sql import SparkSession - from pyspark.sql.functions import regexp_replace - from urllib.parse import unquote, urlparse spark = SparkSession.builder.getOrCreate() - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) - gene_expression_service_json_path = gene_expression_service_json_asset.uri - gene_expression_df = spark.read.json(gene_expression_service_json_path) - result_df = chromosome_map_df.join(gene_expression_df, "mgiGeneAccessionId", "left_outer") - result_df = result_df.filter(result_df.id.isNotNull()) - result_df = result_df.drop("chromosome") - result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) - (result_df - .repartition("mgiGeneAccessionId") - .write - .option("header", True) - .mode("overwrite") - .partitionBy("mgiGeneAccessionId") - .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_expression_temp_json") - ) + 
write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=gene_expression_service_json_asset.uri, + col_to_filter="id", + temp_folder_path="gene_expression_temp_json" + ) print("Finished") @task(outlets=[gene_expression_asset]) def process_temp_folder(*, outlet_events): - import os - import shutil - from glob import iglob - from urllib.parse import unquote, urlparse - - def generate_valid_json_from_file(file_path): - file_data = open(file_path, 'r') - lines = file_data.readlines() - return f"[{','.join(lines)}]" - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - - input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_expression_temp_json" - output_path = f"{get_data_release_work_dir()}/output/impc_spa" - for file_path in iglob(f"{input_path}/**/*.json"): - filepath_parts = file_path.split("/") - filepath_parts.pop() - parent_dir = filepath_parts.pop() - mgi_gene_accession_id = parent_dir.split("=")[1] - original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") - - chromosome = chromosome_map_json[original_mgi_gene_accession_id] - chromosome_folder = f"{output_path}/{chromosome}" - os.makedirs(chromosome_folder, exist_ok=True) - - gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - gene_expression_path = f"{gene_dir_path}/expression.json" - with open(gene_expression_path, "w") as gene_file: - gene_file.write(generate_valid_json_from_file(file_path)) - outlet_events[gene_expression_asset].add(Asset(f"file://{gene_expression_path}")) - - shutil.rmtree(input_path) + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="gene_expression_temp_json", + asset_alias=gene_expression_asset, + outlet_events=outlet_events + ) print("Finished") chain(process_gene_expression(), process_temp_folder()) From 
a46bf96c95034e1abb1641aa3bc4d198ebce21d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 15:26:26 +0000 Subject: [PATCH 23/51] fix: add file name --- impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py index ac277826..6c80cd59 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_expression_mapper.py @@ -41,6 +41,7 @@ def process_temp_folder(*, outlet_events): process_temp_folder_into_files( chromosome_map_path=chromosome_map_json_asset.uri, temp_folder_path="gene_expression_temp_json", + file_name="expression.json", asset_alias=gene_expression_asset, outlet_events=outlet_events ) From bc3ff63acdada2d26dfe16beb9fba8c5a21f3fa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 27 Oct 2025 12:07:03 +0000 Subject: [PATCH 24/51] feat: initial Airflow task to produce gene phenotypehits json files --- .../impc_gene_phenotypehits_mapper.py | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py new file mode 100644 index 00000000..03d35c71 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py @@ -0,0 +1,90 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_phenotype_hits_service_json_asset = 
create_input_asset("output/impc_web_api/gene_phenotype_hits_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_phenotype_hits_asset = AssetAlias("impc_spa_gene_phenotype_hits") + +@dag( + schedule=[gene_phenotype_hits_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_phenotypehits_mapper", + description=textwrap.dedent( + """IMPC SPA gene phenotype hits mapper DAG.""" + ), + tags=["impc_spa", "gene", "phenotype hits"], +) +def impc_spa_gene_phenotypehits_mapper(): + @with_spark_session + @task + def process_gene_phenotypehits(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + gene_phenotype_hits_service_json_path = gene_phenotype_hits_service_json_asset.uri + gene_phenotype_hits_df = spark.read.json(gene_phenotype_hits_service_json_path) + result_df = chromosome_map_df.join(gene_phenotype_hits_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(result_df.alleleAccessionId.isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .repartition("mgiGeneAccessionId") + .write + .option("header", True) + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_phenotypehits_temp_json") + ) + print("Finished") + + @task(outlets=[gene_phenotype_hits_asset]) + def process_temp_folder(*, outlet_events): + import os + import shutil + from glob import iglob + from urllib.parse import unquote, 
urlparse + + def generate_valid_json_from_file(file_path): + file_data = open(file_path, 'r') + lines = file_data.readlines() + return f"[{','.join(lines)}]" + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_phenotypehits_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + gene_phenotypehits_path = f"{gene_dir_path}/phenotypehits.json" + with open(gene_phenotypehits_path, "w") as gene_file: + gene_file.write(generate_valid_json_from_file(file_path)) + outlet_events[gene_phenotype_hits_asset].add(Asset(f"file://{gene_phenotypehits_path}")) + + shutil.rmtree(input_path) + print("Finished") + + chain(process_gene_phenotypehits(), process_temp_folder()) +impc_spa_gene_phenotypehits_mapper() From 242725f501e233dc5811df0feb227e75185fda33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 14:56:28 +0000 Subject: [PATCH 25/51] feat: update task to use utils functions --- .../impc_gene_phenotypehits_mapper.py | 66 ++++--------------- 1 file changed, 14 insertions(+), 52 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py index 03d35c71..01ccaf61 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py +++ 
b/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py @@ -3,6 +3,7 @@ import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files from impc_etl.utils.spark import with_spark_session task_logger = logging.getLogger("airflow.task") @@ -25,65 +26,26 @@ def impc_spa_gene_phenotypehits_mapper(): @task def process_gene_phenotypehits(): from pyspark.sql import SparkSession - from pyspark.sql.functions import regexp_replace - from urllib.parse import unquote, urlparse spark = SparkSession.builder.getOrCreate() - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) - gene_phenotype_hits_service_json_path = gene_phenotype_hits_service_json_asset.uri - gene_phenotype_hits_df = spark.read.json(gene_phenotype_hits_service_json_path) - result_df = chromosome_map_df.join(gene_phenotype_hits_df, "mgiGeneAccessionId", "left_outer") - result_df = result_df.filter(result_df.alleleAccessionId.isNotNull()) - result_df = result_df.drop("chromosome") - result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) - (result_df - .repartition("mgiGeneAccessionId") - .write - .option("header", True) - .mode("overwrite") - .partitionBy("mgiGeneAccessionId") - .json(f"{get_data_release_work_dir()}/output/impc_spa/gene_phenotypehits_temp_json") - ) + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=gene_phenotype_hits_service_json_asset.uri, + col_to_filter="alleleAccessionId", + temp_folder_path="gene_phenotypehits_temp_json" + ) print("Finished") 
@task(outlets=[gene_phenotype_hits_asset]) def process_temp_folder(*, outlet_events): - import os - import shutil - from glob import iglob - from urllib.parse import unquote, urlparse - - def generate_valid_json_from_file(file_path): - file_data = open(file_path, 'r') - lines = file_data.readlines() - return f"[{','.join(lines)}]" - - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - - input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_phenotypehits_temp_json" - output_path = f"{get_data_release_work_dir()}/output/impc_spa" - for file_path in iglob(f"{input_path}/**/*.json"): - filepath_parts = file_path.split("/") - filepath_parts.pop() - parent_dir = filepath_parts.pop() - mgi_gene_accession_id = parent_dir.split("=")[1] - original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") - - chromosome = chromosome_map_json[original_mgi_gene_accession_id] - chromosome_folder = f"{output_path}/{chromosome}" - os.makedirs(chromosome_folder, exist_ok=True) - - gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - gene_phenotypehits_path = f"{gene_dir_path}/phenotypehits.json" - with open(gene_phenotypehits_path, "w") as gene_file: - gene_file.write(generate_valid_json_from_file(file_path)) - outlet_events[gene_phenotype_hits_asset].add(Asset(f"file://{gene_phenotypehits_path}")) - - shutil.rmtree(input_path) + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="gene_phenotypehits_temp_json", + asset_alias=gene_phenotype_hits_asset, + outlet_events=outlet_events + ) print("Finished") chain(process_gene_phenotypehits(), process_temp_folder()) From bfb9d6613ca8aea50986e570e0959834a3c2ffd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 15:27:22 +0000 Subject: [PATCH 26/51] fix: add file name --- 
impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py index 01ccaf61..00e9effc 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_phenotypehits_mapper.py @@ -43,6 +43,7 @@ def process_temp_folder(*, outlet_events): process_temp_folder_into_files( chromosome_map_path=chromosome_map_json_asset.uri, temp_folder_path="gene_phenotypehits_temp_json", + file_name="phenotypehits.json", asset_alias=gene_phenotype_hits_asset, outlet_events=outlet_events ) From f987c8aa41751a6183fd84e14b045d0a2054d07a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 28 Oct 2025 16:53:41 +0000 Subject: [PATCH 27/51] feat: initial task that generates stats-results json files --- .../impc_gene_stats_results_mapper.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_stats_results_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_stats_results_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_stats_results_mapper.py new file mode 100644 index 00000000..0d1b9f71 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_stats_results_mapper.py @@ -0,0 +1,52 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gene_statistical_results_service_json_asset = create_input_asset("output/impc_web_api/gene_statistical_results_service_json") +chromosome_map_json_asset = 
create_input_asset("output/impc_spa/chromosome-map.json") +gene_stats_results_links_asset = AssetAlias("impc_spa_gene_stats_results") + +@dag( + schedule=[gene_statistical_results_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_stats_results_mapper", + description=textwrap.dedent( + """IMPC SPA gene statistical results DAG.""" + ), + tags=["impc_spa", "gene", "statistical results"], +) +def impc_spa_gene_stats_results_mapper(): + @with_spark_session + @task + def process_gene_stats_results(): + from pyspark.sql import SparkSession + + spark = SparkSession.builder.getOrCreate() + + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=gene_statistical_results_service_json_asset.uri, + col_to_filter="alleleAccessionId", + temp_folder_path="stats_results_temp_json" + ) + print("Finished") + + @task(outlets=[gene_stats_results_links_asset]) + def process_temp_folder(*, outlet_events): + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="stats_results_temp_json", + file_name="stats-results.json", + asset_alias=gene_stats_results_links_asset, + outlet_events=outlet_events + ) + print("Finished") + + chain(process_gene_stats_results(), process_temp_folder()) +impc_spa_gene_stats_results_mapper() \ No newline at end of file From e1f965737cfe5476f7dc4d3a05735104d9985c62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Wed, 29 Oct 2025 09:56:20 +0000 Subject: [PATCH 28/51] feat: improve write_partitioned_data function, add argument to mutate dataframe --- impc_etl/utils/impc_spa.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/impc_etl/utils/impc_spa.py b/impc_etl/utils/impc_spa.py index e7055a53..b4086b0d 100644 --- a/impc_etl/utils/impc_spa.py +++ b/impc_etl/utils/impc_spa.py @@ -4,7 +4,8 @@ def write_partitioned_data( chromosome_map_path: str, parquet_path: str, col_to_filter: str, - 
temp_folder_path: str + temp_folder_path: str, + filtered_dataframe_fn: callable = lambda df: df, ): import json from pyspark.sql.functions import col, regexp_replace @@ -16,7 +17,8 @@ def write_partitioned_data( chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) parquet_df = spark.read.json(parquet_path) - result_df = chromosome_map_df.join(parquet_df, "mgiGeneAccessionId", "left_outer") + result_df = filtered_dataframe_fn(parquet_df) + result_df = chromosome_map_df.join(result_df, "mgiGeneAccessionId", "left_outer") result_df = result_df.filter(col(col_to_filter).isNotNull()) result_df = result_df.drop("chromosome") result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) From 01490befb9d17ab006c9d6ccc941f69f75bc5a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Wed, 29 Oct 2025 09:57:13 +0000 Subject: [PATCH 29/51] feat: update gene publications mapper, use improved utils function --- .../impc_spa/impc_gene_publications_mapper.py | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py index ef552575..323045ca 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_publications_mapper.py @@ -1,9 +1,8 @@ -import json import logging import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset -from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir -from impc_etl.utils.impc_spa import process_temp_folder_into_files +from impc_etl.utils.airflow import create_input_asset +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files from impc_etl.utils.spark import 
with_spark_session task_logger = logging.getLogger("airflow.task") @@ -26,30 +25,24 @@ def impc_spa_gene_publications_mapper(): @task def process_gene_publications(): from pyspark.sql import SparkSession - from pyspark.sql.functions import explode, regexp_replace - from urllib.parse import unquote, urlparse + from pyspark.sql.functions import explode + + def process_allele_col(df): + df = df.withColumn("allele_fields", explode("alleles")) + df = df.select("*", "allele_fields.*") + return df spark = SparkSession.builder.getOrCreate() - chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) - publications_service_json_path = publications_service_json_asset.uri - publications_df = spark.read.json(publications_service_json_path) - publications_df = publications_df.withColumn("allele_fields", explode("alleles")) - publications_df = publications_df.select("*", "allele_fields.*") - result_df = chromosome_map_df.join(publications_df, "mgiGeneAccessionId", "left_outer") - result_df = result_df.filter(result_df.alleles.isNotNull()) - result_df = result_df.drop("chromosome") - result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) - (result_df - .repartition("mgiGeneAccessionId") - .write - .option("header", True) - .mode("overwrite") - .partitionBy("mgiGeneAccessionId") - .json(f"{get_data_release_work_dir()}/output/impc_spa/publications_temp_json") - ) + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=publications_service_json_asset.uri, + col_to_filter="alleles", + temp_folder_path="publications_temp_json", + filtered_dataframe_fn=process_allele_col + + ) print("Finished") @task(outlets=[gene_publications_asset]) From 326d1667e0bd535586593d3900da7c86e5de648a Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 30 Oct 2025 13:57:40 +0000 Subject: [PATCH 30/51] feat: added task to generate phenotype summaries --- .../impc_phenotype_summaries_mapper.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_phenotype_summaries_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_phenotype_summaries_mapper.py b/impc_etl/jobs/load/impc_spa/impc_phenotype_summaries_mapper.py new file mode 100644 index 00000000..8edeadf9 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_phenotype_summaries_mapper.py @@ -0,0 +1,45 @@ +import json +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, Asset, Metadata, dag +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +phenotype_summary_service_json_asset = create_input_asset("output/impc_web_api/phenotype_summary_service_json") +phenotype_summaries_asset = AssetAlias("impc_spa_phenotype_summaries") + +@dag( + schedule=[phenotype_summary_service_json_asset], + dag_id=f"{dr_tag}_impc_spa_phenotype_summaries_mapper", + description=textwrap.dedent( + """IMPC SPA phenotype summaries mapper DAG.""" + ), + tags=["impc_spa", "phenotype", "summary"], +) +def impc_spa_phenotype_summaries_mapper(): + @with_spark_session + @task(outlets=[phenotype_summaries_asset]) + def process_phenotype_summaries(*, outlet_events): + import os + from pyspark.sql import SparkSession + + spark = SparkSession.builder.getOrCreate() + + phenotype_summary_service_json_path = phenotype_summary_service_json_asset.uri + phenotype_summary_df = spark.read.json(phenotype_summary_service_json_path) + phenotype_list = map(lambda row: row.asDict(), phenotype_summary_df.collect()) + output_path = 
f"{get_data_release_work_dir()}/output/impc_spa/phenotypes/" + os.makedirs(output_path, exist_ok=True) + for phenotype in phenotype_list: + phenotypeId = phenotype["phenotypeId"].replace(":", "_") + phenotype_dir_path = f"{output_path}/{phenotypeId}" + os.makedirs(phenotype_dir_path, exist_ok=True) + phenotype_summary_path = f"{phenotype_dir_path}/summary.json" + with open(phenotype_summary_path, "w") as phenotype_file: + phenotype_file.write(json.dumps(phenotype)) + outlet_events[phenotype_summaries_asset].add(Asset(f"file://{phenotype_summary_path}")) + process_phenotype_summaries() +impc_spa_phenotype_summaries_mapper() From 381e404b8da785967e2524dc555a8ae15946ed53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 30 Oct 2025 14:53:27 +0000 Subject: [PATCH 31/51] feat: added Airflow task to generate phenotype genotypehits json file --- .../impc_phenotype_genotypehits_mapper.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_phenotype_genotypehits_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_phenotype_genotypehits_mapper.py b/impc_etl/jobs/load/impc_spa/impc_phenotype_genotypehits_mapper.py new file mode 100644 index 00000000..90bcdeb9 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_phenotype_genotypehits_mapper.py @@ -0,0 +1,68 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, dag, chain, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +phenotype_genotype_hits_service_json_asset = create_input_asset("output/impc_web_api/phenotype_genotype_hits_service_json") +phenotype_genotypehits_asset = AssetAlias("impc_spa_phenotype_genotypehits") + +@dag( + schedule=[phenotype_genotype_hits_service_json_asset], + 
dag_id=f"{dr_tag}_impc_spa_phenotype_genotypehits_mapper", + description=textwrap.dedent( + """IMPC SPA phenotype genotypehits mapper DAG.""" + ), + tags=["impc_spa", "phenotype", "genotypehits"], +) +def impc_spa_phenotype_summaries_mapper(): + @with_spark_session + @task + def process_phenotype_genotypehits_parquet(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import regexp_replace + + spark = SparkSession.builder.getOrCreate() + + phenotype_genotype_hits_service_json_path = phenotype_genotype_hits_service_json_asset.uri + phenotype_genotypehits_df = spark.read.json(phenotype_genotype_hits_service_json_path) + result_df = phenotype_genotypehits_df.withColumn("phenotypeId", regexp_replace("phenotypeId", ":", "_")) + (result_df + .repartition("phenotypeId") + .write + .mode("overwrite") + .partitionBy("phenotypeId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/phenotype_genotypehits_temp_json") + ) + + @task(outlets=[phenotype_genotypehits_asset]) + def process_temp_folder(*, outlet_events): + import os + import shutil + from glob import iglob + def generate_valid_json_from_file(file_path): + file_data = open(file_path, 'r') + lines = file_data.readlines() + return f"[{','.join(lines)}]" + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/phenotype_genotypehits_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa/phenotypes/" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + phenotype_id = parent_dir.split("=")[1] + + phenotype_dir_path = f"{output_path}/{phenotype_id}" + os.makedirs(phenotype_dir_path, exist_ok=True) + phenotype_genotypehits_path = f"{phenotype_dir_path}/genotype-hits.json" + + with open(phenotype_genotypehits_path, "w") as phenotype_file: + phenotype_file.write(generate_valid_json_from_file(file_path)) + 
outlet_events[phenotype_genotypehits_asset].add(Asset(f"file://{phenotype_genotypehits_path}")) + shutil.rmtree(input_path) + chain(process_phenotype_genotypehits_parquet(), process_temp_folder()) +impc_spa_phenotype_summaries_mapper() From 682ebc4de2beb9a06a0bcfdd99cefe11b57a19f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 4 Nov 2025 17:14:59 +0000 Subject: [PATCH 32/51] chore: move imports outside functions --- impc_etl/utils/impc_spa.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/impc_etl/utils/impc_spa.py b/impc_etl/utils/impc_spa.py index b4086b0d..1e6eecec 100644 --- a/impc_etl/utils/impc_spa.py +++ b/impc_etl/utils/impc_spa.py @@ -1,3 +1,12 @@ +from pyspark.sql.functions import col, regexp_replace +from glob import iglob +from urllib.parse import unquote, urlparse +from airflow.sdk import Asset +from impc_etl.utils.airflow import get_data_release_work_dir +import os +import json +import shutil + def write_partitioned_data( spark, @@ -7,10 +16,6 @@ def write_partitioned_data( temp_folder_path: str, filtered_dataframe_fn: callable = lambda df: df, ): - import json - from pyspark.sql.functions import col, regexp_replace - from urllib.parse import unquote, urlparse - from impc_etl.utils.airflow import get_data_release_work_dir chromosome_map_json_path = unquote(urlparse(chromosome_map_path).path) @@ -44,13 +49,6 @@ def process_temp_folder_into_files( asset_alias, outlet_events ): - from glob import iglob - from urllib.parse import unquote, urlparse - from airflow.sdk import Asset - from impc_etl.utils.airflow import get_data_release_work_dir - import os - import json - import shutil chromosome_map_json_path = unquote(urlparse(chromosome_map_path).path) chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) From 3c9fa94a1dd4ff98c163ba99eac7499c83f39ea0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 4 Nov 2025 17:16:16 +0000 Subject: [PATCH 
33/51] fix: use another variable name to store new file path --- impc_etl/utils/impc_spa.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/impc_etl/utils/impc_spa.py b/impc_etl/utils/impc_spa.py index 1e6eecec..b8f92bb9 100644 --- a/impc_etl/utils/impc_spa.py +++ b/impc_etl/utils/impc_spa.py @@ -67,8 +67,9 @@ def process_temp_folder_into_files( os.makedirs(chromosome_folder, exist_ok=True) gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - file_path = f"{gene_dir_path}/{file_name}" - with open(file_path, "w") as gene_file: + os.makedirs(gene_dir_path, exist_ok=True) + file_to_be_generated_path = f"{gene_dir_path}/{file_name}" + with open(file_to_be_generated_path, "w") as gene_file: gene_file.write(generate_valid_json_from_file(file_path)) - outlet_events[asset_alias].add(Asset(f"file://{file_path}")) + outlet_events[asset_alias].add(Asset(f"file://{file_to_be_generated_path}")) shutil.rmtree(input_path) \ No newline at end of file From 5975309852c441e6af893176acd1b23cbc92c9ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Tue, 4 Nov 2025 17:25:46 +0000 Subject: [PATCH 34/51] feat: added Airflow task to generate order.json files --- .../load/impc_spa/impc_gene_order_mapper.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_order_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_order_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_order_mapper.py new file mode 100644 index 00000000..628a10cd --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_order_mapper.py @@ -0,0 +1,50 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, dag, chain +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files +from impc_etl.utils.spark import with_spark_session +from impc_etl.utils.airflow import create_input_asset + +task_logger = logging.getLogger("airflow.task") 
+dr_tag = Variable.get("data_release_tag") + +product_report_json_path_asset = create_input_asset("tracking/gentar-products_order.json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_order_asset = AssetAlias("impc_spa_gene_order") + +@dag( + schedule=[product_report_json_path_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_order_mapper", + description=textwrap.dedent( + """IMPC SPA gene order mapper DAG.""" + ), + tags=["impc_spa", "gene", "order"], +) +def impc_spa_gene_order_mapper(): + @with_spark_session + @task + def process_gene_order_report(): + from pyspark.sql import SparkSession + + spark = SparkSession.builder.getOrCreate() + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=product_report_json_path_asset.uri, + col_to_filter="alleleSymbol", + temp_folder_path="gene_order_temp_json" + ) + print("Finished") + + @task(outlets=[gene_order_asset]) + def process_temp_folder(*, outlet_events): + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="gene_order_temp_json", + file_name="order.json", + asset_alias=gene_order_asset, + outlet_events=outlet_events + ) + print("Finished") + chain(process_gene_order_report(), process_temp_folder()) +impc_spa_gene_order_mapper() \ No newline at end of file From c2e7f482779cd81fb706862127a5471f2ec01741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 6 Nov 2025 12:52:07 +0000 Subject: [PATCH 35/51] feat: added Airflow task to generate allele mice json file --- .../impc_spa/impc_gene_allele_mice_mapper.py | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py new file mode 100644 index 00000000..0db9fa67 --- 
/dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py @@ -0,0 +1,86 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, dag, chain + +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.impc_spa import write_partitioned_data +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gentar_products_mice_latest_json_output_asset = create_input_asset("output/impc_web_api/gentar-products_mice-latest.json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") + +@dag( + schedule=[gentar_products_mice_latest_json_output_asset], + dag_id=f"{dr_tag}_impc_spa_gene_allele_mice_mapper", + description=textwrap.dedent( + """IMPC SPA gene allele mice mapper DAG.""" + ), + tags=["impc_spa", "allele", "mice"], +) +def impc_spa_gene_allele_mice_mapper(): + @with_spark_session + @task + def process_allele_mice_data(): + from pyspark.sql import SparkSession + + spark = SparkSession.builder.getOrCreate() + + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=gentar_products_mice_latest_json_output_asset.uri, + col_to_filter="alleleName", + temp_folder_path="allele_mice_temp_json" + ) + print("Finished") + @task + def process_temp_folder(): + import json + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/allele_mice_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + 
mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + os.makedirs(gene_dir_path, exist_ok=True) + + allele_dict = {} + # JSON file can have multiple objects for distinct alleles + allele_data = open(file_path, 'r') + for json_obj_str in allele_data.readlines(): + allele_obj = json.loads(json_obj_str) + if allele_obj["alleleName"] in allele_dict: + allele_dict[allele_obj["alleleName"]].append(json_obj_str) + else: + allele_dict[allele_obj["alleleName"]] = [json_obj_str] + + + for original_allele_name, allele_json_list in allele_dict.items(): + allele_name = original_allele_name.replace("/", "_") + allele_dir_path = f"{gene_dir_path}/{allele_name}" + os.makedirs(allele_dir_path, exist_ok=True) + file_to_be_generated_path = f"{allele_dir_path}/mice.json" + with open(file_to_be_generated_path, "w") as allele_file: + allele_file.write(f"[{','.join(allele_json_list)}]") + shutil.rmtree(input_path) + print("Finished") + + chain(process_allele_mice_data(), process_temp_folder()) +impc_spa_gene_allele_mice_mapper() From 57ec2a15b424b6cc3c78a26b8da930f885f04223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Fri, 7 Nov 2025 10:56:37 +0000 Subject: [PATCH 36/51] fix: gene allele mice mapper, add chromosome map asset to schedule list --- impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py index 0db9fa67..d1363770 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py @@ -13,7 
+13,7 @@ chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") @dag( - schedule=[gentar_products_mice_latest_json_output_asset], + schedule=[gentar_products_mice_latest_json_output_asset, chromosome_map_json_asset], dag_id=f"{dr_tag}_impc_spa_gene_allele_mice_mapper", description=textwrap.dedent( """IMPC SPA gene allele mice mapper DAG.""" From 0ff8796c7ab16914bbaa9976f5fc120dce6ad2c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Fri, 7 Nov 2025 14:34:22 +0000 Subject: [PATCH 37/51] WIP: gene allele Es Cells mapper proper column names need to be defined --- .../impc_gene_allele_escell_mapper.py | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py new file mode 100644 index 00000000..fb7563df --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py @@ -0,0 +1,99 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, dag, chain + +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +product_report_parquet_asset = create_input_asset("output/product_report_parquet") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") + +@dag( + schedule=[product_report_parquet_asset], + dag_id=f"{dr_tag}_impc_spa_gene_escell_mapper", + description=textwrap.dedent( + """IMPC SPA gene allele EsCells mapper DAG.""" + ), + tags=["impc_spa", "allele", "escells"], +) +def impc_spa_gene_allele_escell_mapper(): + @with_spark_session + @task + def process_allele_escell_data(): + import json + from pyspark.sql import SparkSession + from pyspark.sql.functions 
import col, regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + + product_df = spark.read.parquet(product_report_parquet_asset.uri) + product_df = product_df.filter(col("type") == "es_cell") + product_df = product_df.withColumnRenamed("mgi_accession_id", "mgiGeneAccessionId") + + result_df = chromosome_map_df.join(product_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(col("allele_name").isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + + (result_df + .repartition("mgiGeneAccessionId") + .write + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/allele_escell_temp_json") + ) + + @task + def process_temp_folder(): + import json + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/allele_escell_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, 
exist_ok=True) + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + os.makedirs(gene_dir_path, exist_ok=True) + + allele_dict = {} + # JSON file can have multiple objects for distinct alleles + allele_data = open(file_path, 'r') + for json_obj_str in allele_data.readlines(): + allele_obj = json.loads(json_obj_str) + if allele_obj["allele_name"] in allele_dict: + allele_dict[allele_obj["allele_name"]].append(json_obj_str) + else: + allele_dict[allele_obj["allele_name"]] = [json_obj_str] + + for original_allele_name, allele_json_list in allele_dict.items(): + allele_name = original_allele_name.replace("/", "_") + allele_dir_path = f"{gene_dir_path}/{allele_name}" + os.makedirs(allele_dir_path, exist_ok=True) + file_to_be_generated_path = f"{allele_dir_path}/es_cell.json" + with open(file_to_be_generated_path, "w") as allele_file: + allele_file.write(f"[{','.join(allele_json_list)}]") + shutil.rmtree(input_path) + print("Finished") + + chain(process_allele_escell_data(), process_temp_folder()) +impc_spa_gene_allele_escell_mapper() \ No newline at end of file From 1661b56fd90eb49721fa3ae259d413024e1ee064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Fri, 7 Nov 2025 16:23:34 +0000 Subject: [PATCH 38/51] WIP: gene allele tvp mapper --- .../impc_spa/impc_gene_allele_tvp_mapper.py | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py new file mode 100644 index 00000000..f94fe885 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py @@ -0,0 +1,99 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, dag, chain + +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = 
logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +product_report_parquet_asset = create_input_asset("output/product_report_parquet") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") + +@dag( + schedule=[product_report_parquet_asset], + dag_id=f"{dr_tag}_impc_spa_gene_tvp_mapper", + description=textwrap.dedent( + """IMPC SPA gene allele targeting vector mapper DAG.""" + ), + tags=["impc_spa", "allele", "tvp"], +) +def impc_spa_gene_allele_tvp_mapper(): + @with_spark_session + @task + def process_allele_tvp_data(): + import json + from pyspark.sql import SparkSession + from pyspark.sql.functions import col, regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + + product_df = spark.read.parquet(product_report_parquet_asset.uri) + product_df = product_df.filter(col("type") == "targeting_vector") + product_df = product_df.withColumnRenamed("mgi_accession_id", "mgiGeneAccessionId") + + result_df = chromosome_map_df.join(product_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(col("allele_name").isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + + (result_df + .repartition("mgiGeneAccessionId") + .write + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/allele_tvp_temp_json") + ) + + @task + def process_temp_folder(): + import json + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + chromosome_map_json_path = 
unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/allele_tvp_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + os.makedirs(gene_dir_path, exist_ok=True) + + allele_dict = {} + # JSON file can have multiple objects for distinct alleles + allele_data = open(file_path, 'r') + for json_obj_str in allele_data.readlines(): + allele_obj = json.loads(json_obj_str) + if allele_obj["allele_name"] in allele_dict: + allele_dict[allele_obj["allele_name"]].append(json_obj_str) + else: + allele_dict[allele_obj["allele_name"]] = [json_obj_str] + + for original_allele_name, allele_json_list in allele_dict.items(): + allele_name = original_allele_name.replace("/", "_") + allele_dir_path = f"{gene_dir_path}/{allele_name}" + os.makedirs(allele_dir_path, exist_ok=True) + file_to_be_generated_path = f"{allele_dir_path}/es_cell.json" + with open(file_to_be_generated_path, "w") as allele_file: + allele_file.write(f"[{','.join(allele_json_list)}]") + shutil.rmtree(input_path) + print("Finished") + + chain(process_allele_tvp_data(), process_temp_folder()) +impc_spa_gene_allele_tvp_mapper() \ No newline at end of file From f5e55dcb281d0efe181232deeeb9e8929d4cba49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 10 Nov 2025 11:13:33 +0000 Subject: [PATCH 39/51] fix: set correct filename in gene 
allele tvp mapper --- impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py index f94fe885..9ae4a6e1 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py @@ -89,7 +89,7 @@ def process_temp_folder(): allele_name = original_allele_name.replace("/", "_") allele_dir_path = f"{gene_dir_path}/{allele_name}" os.makedirs(allele_dir_path, exist_ok=True) - file_to_be_generated_path = f"{allele_dir_path}/es_cell.json" + file_to_be_generated_path = f"{allele_dir_path}/tvp.json" with open(file_to_be_generated_path, "w") as allele_file: allele_file.write(f"[{','.join(allele_json_list)}]") shutil.rmtree(input_path) From 64a7184a00917b73afa007d9497515a93c905cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 10 Nov 2025 11:32:42 +0000 Subject: [PATCH 40/51] feat: gene allele ivp mapper task --- .../impc_spa/impc_gene_allele_ivp_mapper.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py new file mode 100644 index 00000000..e12931d9 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py @@ -0,0 +1,100 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, dag, chain + +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +product_report_parquet_asset = create_input_asset("output/product_report_parquet") +chromosome_map_json_asset = 
create_input_asset("output/impc_spa/chromosome-map.json") + +@dag( + schedule=[product_report_parquet_asset], + dag_id=f"{dr_tag}_impc_spa_gene_ivp_mapper", + description=textwrap.dedent( + """IMPC SPA gene allele intermediate vector mapper DAG.""" + ), + tags=["impc_spa", "allele", "ivp"], +) +def impc_spa_gene_allele_ivp_mapper(): + @with_spark_session + @task + def process_allele_ivp_data(): + import json + from pyspark.sql import SparkSession + from pyspark.sql.functions import col, regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + + product_df = spark.read.parquet(product_report_parquet_asset.uri) + product_df = product_df.filter(col("type") == "intermediate_vector") + product_df = product_df.withColumnRenamed("mgi_accession_id", "mgiGeneAccessionId") + + result_df = chromosome_map_df.join(product_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(col("allele_name").isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + + print(result_df.count()) + (result_df + .repartition("mgiGeneAccessionId") + .write + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/allele_ivp_temp_json") + ) + + @task + def process_temp_folder(): + import json + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = 
f"{get_data_release_work_dir()}/output/impc_spa/allele_ivp_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + os.makedirs(gene_dir_path, exist_ok=True) + + allele_dict = {} + # JSON file can have multiple objects for distinct alleles + allele_data = open(file_path, 'r') + for json_obj_str in allele_data.readlines(): + allele_obj = json.loads(json_obj_str) + if allele_obj["allele_name"] in allele_dict: + allele_dict[allele_obj["allele_name"]].append(json_obj_str) + else: + allele_dict[allele_obj["allele_name"]] = [json_obj_str] + + for original_allele_name, allele_json_list in allele_dict.items(): + allele_name = original_allele_name.replace("/", "_") + allele_dir_path = f"{gene_dir_path}/{allele_name}" + os.makedirs(allele_dir_path, exist_ok=True) + file_to_be_generated_path = f"{allele_dir_path}/ivp.json" + with open(file_to_be_generated_path, "w") as allele_file: + allele_file.write(f"[{','.join(allele_json_list)}]") + shutil.rmtree(input_path) + print("Finished") + + chain(process_allele_ivp_data(), process_temp_folder()) +impc_spa_gene_allele_ivp_mapper() \ No newline at end of file From 5809efff04692695184ce1205f60d1f47e920985 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 10 Nov 2025 11:44:52 +0000 Subject: [PATCH 41/51] feat: add gentar crispr report asset to data_ingestion --- impc_etl/jobs/ingest/data_ingestion.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git 
a/impc_etl/jobs/ingest/data_ingestion.py b/impc_etl/jobs/ingest/data_ingestion.py index 5310b105..17bb4b33 100644 --- a/impc_etl/jobs/ingest/data_ingestion.py +++ b/impc_etl/jobs/ingest/data_ingestion.py @@ -144,6 +144,32 @@ def copy_products_report(): return tracking_products_mice_file_asset +tracking_products_crispr_file_asset = create_input_asset( + "tracking/gentar-products_crispr.json" +) + +@asset.multi( + schedule=[tracking_directory_asset], + outlets=[tracking_products_crispr_file_asset], + dag_id=f"{dr_tag}_copy_products_crispr_report", +) +def copy_products_crispr_report(): + """Gather tracking data from GenTar when tracking directory is available""" + source_file = f"{data_archive_path}/gentar-data-archive/product_reports/gentar-products_crispr-latest.json" + target_file = f"{input_data_path}/tracking/gentar-products_crispr.json" + + task_logger.info(f"Copying tracking data from {source_file} to {target_file}") + + # Ensure source file exists + if not os.path.exists(source_file): + raise FileNotFoundError(f"Source tracking file not found: {source_file}") + + # Copy the file + shutil.copy(source_file, target_file) + task_logger.info(f"Successfully copied tracking data to {target_file}") + + return tracking_products_crispr_file_asset + gene_interest_asset = create_input_asset("tracking/gene_interest.tsv") gene_interest_json_asset = create_input_asset("tracking/gene_interest.json") From e061a1bc663ad364e3d00ac5d0e802d41b3ab502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 10 Nov 2025 15:31:04 +0000 Subject: [PATCH 42/51] feat: added gene allele crispr mapper Airflow task --- .../impc_gene_allele_crispr_mapper.py | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py new file mode 100644 index 
00000000..636b333f --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py @@ -0,0 +1,105 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, dag, chain + +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +gentar_products_crispr_latest_json_output_asset = create_input_asset("tracking/gentar-products_crispr.json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") + +@dag( + schedule=[gentar_products_crispr_latest_json_output_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_allele_crispr_mapper", + description=textwrap.dedent( + """IMPC SPA gene allele crispr mapper DAG.""" + ), + tags=["impc_spa", "allele", "crispr"], +) +def impc_spa_gene_allele_crispr_mapper(): + @with_spark_session + @task + def process_allele_crispr_data(): + import json + from pyspark.sql import SparkSession + from pyspark.sql.functions import col, regexp_replace + from urllib.parse import unquote, urlparse + + spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + + product_report_path = unquote(urlparse(gentar_products_crispr_latest_json_output_asset.uri).path) + file_data = open(product_report_path, 'r') + lines = filter(lambda line: line.replace("\n", ""), file_data.readlines()) + lines = filter(None, lines) + json_str = "[" + ",".join(lines) + "]" + product_report_json = json.loads(json_str) + product_crispr_df = spark.createDataFrame(product_report_json) + + result_df = chromosome_map_df.join(product_crispr_df, "mgiGeneAccessionId", "left_outer") + 
result_df = result_df.filter(col("alleleSuperscript").isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + (result_df + .repartition("mgiGeneAccessionId") + .write + .mode("overwrite") + .partitionBy("mgiGeneAccessionId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/allele_crispr_temp_json") + ) + + print("Finished") + @task + def process_temp_folder(): + import json + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/allele_crispr_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json"): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + os.makedirs(gene_dir_path, exist_ok=True) + + allele_dict = {} + # JSON file can have multiple objects for distinct alleles + allele_data = open(file_path, 'r') + for json_obj_str in allele_data.readlines(): + allele_obj = json.loads(json_obj_str) + if allele_obj["alleleSuperscript"] in allele_dict: + allele_dict[allele_obj["alleleSuperscript"]].append(json_obj_str) + else: + allele_dict[allele_obj["alleleSuperscript"]] = [json_obj_str] + + + for original_allele_name, allele_json_list in allele_dict.items(): + allele_name = original_allele_name.replace("/", "_") + 
allele_dir_path = f"{gene_dir_path}/{allele_name}" + os.makedirs(allele_dir_path, exist_ok=True) + file_to_be_generated_path = f"{allele_dir_path}/crispr.json" + with open(file_to_be_generated_path, "w") as allele_file: + allele_file.write(f"[{','.join(allele_json_list)}]") + shutil.rmtree(input_path) + print("Finished") + + chain(process_allele_crispr_data(), process_temp_folder()) +impc_spa_gene_allele_crispr_mapper() From d0b148eb246becd76dfc26fd7224e4c6acf3ce96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Mon, 10 Nov 2025 16:14:22 +0000 Subject: [PATCH 43/51] feat: gentar crispr report loader moved into a function in impc_spa module --- .../impc_spa/impc_gene_allele_crispr_mapper.py | 9 ++------- impc_etl/utils/impc_spa.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py index 636b333f..703f34d7 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py @@ -3,6 +3,7 @@ from airflow.sdk import Variable, task, dag, chain from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.impc_spa import create_gentar_crispr_report_df from impc_etl.utils.spark import with_spark_session task_logger = logging.getLogger("airflow.task") @@ -34,13 +35,7 @@ def process_allele_crispr_data(): chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) - product_report_path = unquote(urlparse(gentar_products_crispr_latest_json_output_asset.uri).path) - file_data = open(product_report_path, 'r') - lines = filter(lambda line: line.replace("\n", ""), file_data.readlines()) - lines = filter(None, lines) - json_str = "[" + ",".join(lines) + "]" - 
product_report_json = json.loads(json_str) - product_crispr_df = spark.createDataFrame(product_report_json) + product_crispr_df = create_gentar_crispr_report_df(spark, gentar_products_crispr_latest_json_output_asset.uri) result_df = chromosome_map_df.join(product_crispr_df, "mgiGeneAccessionId", "left_outer") result_df = result_df.filter(col("alleleSuperscript").isNotNull()) diff --git a/impc_etl/utils/impc_spa.py b/impc_etl/utils/impc_spa.py index b8f92bb9..064e70ac 100644 --- a/impc_etl/utils/impc_spa.py +++ b/impc_etl/utils/impc_spa.py @@ -72,4 +72,16 @@ def process_temp_folder_into_files( with open(file_to_be_generated_path, "w") as gene_file: gene_file.write(generate_valid_json_from_file(file_path)) outlet_events[asset_alias].add(Asset(f"file://{file_to_be_generated_path}")) - shutil.rmtree(input_path) \ No newline at end of file + shutil.rmtree(input_path) + +def create_gentar_crispr_report_df( + spark, + asset_path: str, +): + product_report_path = unquote(urlparse(asset_path).path) + file_data = open(product_report_path, 'r') + lines = filter(lambda line: line.replace("\n", ""), file_data.readlines()) + lines = filter(None, lines) + json_str = "[" + ",".join(lines) + "]" + product_report_json = json.loads(json_str) + return spark.createDataFrame(product_report_json) \ No newline at end of file From 0d3e3059bf827a090f4b6071f83f9dc88abf0dca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 13 Nov 2025 16:58:32 +0000 Subject: [PATCH 44/51] feat: added histopathology datasets mapper Airflow task --- ...mpc_gene_histopathology_datasets_mapper.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_histopathology_datasets_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_datasets_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_datasets_mapper.py new file mode 100644 index 00000000..2e2518c3 --- /dev/null +++ 
b/impc_etl/jobs/load/impc_spa/impc_gene_histopathology_datasets_mapper.py @@ -0,0 +1,51 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset +from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +histopathology_service_json_asset = create_input_asset("output/impc_web_api/histopathology_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_histopathology_asset = AssetAlias("impc_spa_gene_histopathology") + +@dag( + schedule=[histopathology_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_gene_histopathology_datasets_mapper", + description=textwrap.dedent( + """IMPC SPA gene histopathology datasets mapper DAG.""" + ), + tags=["impc_spa", "gene", "histopathology", "datasets"], +) +def impc_spa_gene_histopathology_datasets_mapper(): + @with_spark_session + @task + def process_gene_histopathology_datasets(): + from pyspark.sql import SparkSession + + spark = SparkSession.builder.getOrCreate() + write_partitioned_data( + spark, + chromosome_map_path=chromosome_map_json_asset.uri, + parquet_path=histopathology_service_json_asset.uri, + col_to_filter="datasets", + temp_folder_path="histopathology_datasets_temp_json" + ) + print("Finished") + + @task(outlets=[gene_histopathology_asset]) + def process_temp_folder(*, outlet_events): + process_temp_folder_into_files( + chromosome_map_path=chromosome_map_json_asset.uri, + temp_folder_path="histopathology_datasets_temp_json", + file_name="full-histopathology.json", + asset_alias=gene_histopathology_asset, + outlet_events=outlet_events + ) + print("Finished") + + chain(process_gene_histopathology_datasets(), process_temp_folder()) 
+impc_spa_gene_histopathology_datasets_mapper() From 7c0640550e420077f13bc2ca931f77ca04099284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Wed, 19 Nov 2025 11:09:57 +0000 Subject: [PATCH 45/51] feat: update Allele tasks to write files in correct location --- impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py | 4 +++- impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py | 4 +++- impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py | 4 +++- impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py | 4 +++- impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py | 4 +++- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py index 703f34d7..d64cd3d6 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_crispr_mapper.py @@ -86,9 +86,11 @@ def process_temp_folder(): allele_dict[allele_obj["alleleSuperscript"]] = [json_obj_str] + general_alleles_dir_path = f"{gene_dir_path}/alleles" + os.makedirs(general_alleles_dir_path, exist_ok=True) for original_allele_name, allele_json_list in allele_dict.items(): allele_name = original_allele_name.replace("/", "_") - allele_dir_path = f"{gene_dir_path}/{allele_name}" + allele_dir_path = f"{general_alleles_dir_path}/{allele_name}" os.makedirs(allele_dir_path, exist_ok=True) file_to_be_generated_path = f"{allele_dir_path}/crispr.json" with open(file_to_be_generated_path, "w") as allele_file: diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py index fb7563df..5bab3e32 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_escell_mapper.py @@ -85,9 +85,11 @@ def process_temp_folder(): else: 
allele_dict[allele_obj["allele_name"]] = [json_obj_str] + general_alleles_dir_path = f"{gene_dir_path}/alleles" + os.makedirs(general_alleles_dir_path, exist_ok=True) for original_allele_name, allele_json_list in allele_dict.items(): allele_name = original_allele_name.replace("/", "_") - allele_dir_path = f"{gene_dir_path}/{allele_name}" + allele_dir_path = f"{general_alleles_dir_path}/{allele_name}" os.makedirs(allele_dir_path, exist_ok=True) file_to_be_generated_path = f"{allele_dir_path}/es_cell.json" with open(file_to_be_generated_path, "w") as allele_file: diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py index e12931d9..714c67f8 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_ivp_mapper.py @@ -86,9 +86,11 @@ def process_temp_folder(): else: allele_dict[allele_obj["allele_name"]] = [json_obj_str] + general_alleles_dir_path = f"{gene_dir_path}/alleles" + os.makedirs(general_alleles_dir_path, exist_ok=True) for original_allele_name, allele_json_list in allele_dict.items(): allele_name = original_allele_name.replace("/", "_") - allele_dir_path = f"{gene_dir_path}/{allele_name}" + allele_dir_path = f"{general_alleles_dir_path}/{allele_name}" os.makedirs(allele_dir_path, exist_ok=True) file_to_be_generated_path = f"{allele_dir_path}/ivp.json" with open(file_to_be_generated_path, "w") as allele_file: diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py index d1363770..f4276bb9 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_mice_mapper.py @@ -72,9 +72,11 @@ def process_temp_folder(): allele_dict[allele_obj["alleleName"]] = [json_obj_str] + general_alleles_dir_path = f"{gene_dir_path}/alleles" + os.makedirs(general_alleles_dir_path, exist_ok=True) for 
original_allele_name, allele_json_list in allele_dict.items(): allele_name = original_allele_name.replace("/", "_") - allele_dir_path = f"{gene_dir_path}/{allele_name}" + allele_dir_path = f"{general_alleles_dir_path}/{allele_name}" os.makedirs(allele_dir_path, exist_ok=True) file_to_be_generated_path = f"{allele_dir_path}/mice.json" with open(file_to_be_generated_path, "w") as allele_file: diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py index 9ae4a6e1..7bbff311 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_tvp_mapper.py @@ -85,9 +85,11 @@ def process_temp_folder(): else: allele_dict[allele_obj["allele_name"]] = [json_obj_str] + general_alleles_dir_path = f"{gene_dir_path}/alleles" + os.makedirs(general_alleles_dir_path, exist_ok=True) for original_allele_name, allele_json_list in allele_dict.items(): allele_name = original_allele_name.replace("/", "_") - allele_dir_path = f"{gene_dir_path}/{allele_name}" + allele_dir_path = f"{general_alleles_dir_path}/{allele_name}" os.makedirs(allele_dir_path, exist_ok=True) file_to_be_generated_path = f"{allele_dir_path}/tvp.json" with open(file_to_be_generated_path, "w") as allele_file: From 341d868db66b080b50405c33ef8983387b6830bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 20 Nov 2025 12:50:13 +0000 Subject: [PATCH 46/51] WIP: airflow task to generate allele summary json files --- .../impc_gene_allele_summary_mapper.py | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 impc_etl/jobs/load/impc_spa/impc_gene_allele_summary_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_allele_summary_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_allele_summary_mapper.py new file mode 100644 index 00000000..1432d1c0 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_allele_summary_mapper.py @@ -0,0 
"""Airflow DAG: build per-allele summary.json files (order/product availability)
for the IMPC SPA, keyed by chromosome / gene / allele."""
import logging
import textwrap
from airflow.sdk import Variable, task, AssetAlias, dag, chain
from impc_etl.utils.impc_spa import write_partitioned_data, process_temp_folder_into_files, \
    create_gentar_crispr_report_df
from impc_etl.utils.spark import with_spark_session
from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir

task_logger = logging.getLogger("airflow.task")
dr_tag = Variable.get("data_release_tag")

order_report_json_path_asset = create_input_asset("tracking/gentar-products_order.json")
gentar_products_crispr_latest_json_output_asset = create_input_asset("tracking/gentar-products_crispr.json")
gentar_products_mice_latest_json_output_asset = create_input_asset("output/impc_web_api/gentar-products_mice-latest.json")
product_report_parquet_asset = create_input_asset("output/product_report_parquet")
chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json")

@dag(
    schedule=[
        order_report_json_path_asset,
        gentar_products_crispr_latest_json_output_asset,
        gentar_products_mice_latest_json_output_asset,
        product_report_parquet_asset,
        chromosome_map_json_asset
    ],
    dag_id=f"{dr_tag}_impc_spa_gene_allele_summary_mapper",
    description=textwrap.dedent(
        """IMPC SPA gene allele summary mapper DAG."""
    ),
    tags=["impc_spa", "allele", "summary"],
)
def impc_spa_gene_order_mapper():
    # NOTE(review): the function name says "order mapper" but the DAG produces
    # allele *summary* files (the dag_id is correct). Kept as-is because the
    # name is the module's public attribute; rename in a follow-up.

    @with_spark_session
    @task
    def process_reports():
        """Join the order report with per-source product-existence flags and
        write the result as JSON partitioned by (filesystem-safe) MGI gene id."""
        import json
        from pyspark.sql import SparkSession
        from pyspark.sql.functions import col, concat, lit, regexp_replace, when
        from urllib.parse import unquote, urlparse

        spark = SparkSession.builder.getOrCreate()

        order_report_df = spark.read.json(order_report_json_path_asset.uri)
        general_product_df = spark.read.parquet(product_report_parquet_asset.uri)
        product_crispr_df = create_gentar_crispr_report_df(spark, gentar_products_crispr_latest_json_output_asset.uri)
        product_mice_df = spark.read.json(gentar_products_mice_latest_json_output_asset.uri)

        order_report_df = order_report_df.withColumnRenamed("alleleSuperscript", "alleleName")
        product_crispr_df = product_crispr_df.withColumnRenamed("alleleSuperscript", "alleleName")

        # Each source is reduced to one row per allele symbol carrying a constant
        # marker column; after the left joins below a null marker means "absent".
        product_crispr_df = product_crispr_df.select(
            "alleleSymbol",
            lit(True).alias("doesCrisprProductsExist")
        ).dropDuplicates()

        # Allele symbol convention throughout: "<geneSymbol><alleleName>".
        product_mice_df = product_mice_df.withColumn(
            "alleleSymbol",
            concat(col("geneSymbol"), lit("<"), col("alleleName"), lit(">"))
        )
        product_mice_df = product_mice_df.select(
            "alleleSymbol",
            lit(True).alias("doesMiceProductsExist")
        ).dropDuplicates()

        general_product_df = general_product_df.withColumnsRenamed({
            "allele_name": "alleleName",
            "marker_symbol": "geneSymbol",
            "mgi_accession_id": "mgiGeneAccessionId"
        })
        general_product_df = general_product_df.withColumn(
            "alleleSymbol",
            concat(col("geneSymbol"), lit("<"), col("alleleName"), lit(">"))
        )
        general_product_df = general_product_df.drop("geneSymbol")

        es_cell_product_df = general_product_df.where(col("type") == "es_cell").select(
            "alleleSymbol",
            lit(True).alias("doesEsCellProductsExist")
        )
        tvp_product_df = general_product_df.where(col("type") == "targeting_vector").select(
            "alleleSymbol",
            lit(True).alias("doesTargetingVectorProductsExist")
        )
        # BUGFIX: intermediate vectors were filtered with type == "targeting_vector"
        # (copy-paste), so doesIntermediateVectorProductsExist duplicated the TVP
        # flag. TODO confirm "intermediate_vector" is the exact type value in the
        # product report.
        ivp_product_df = general_product_df.where(col("type") == "intermediate_vector").select(
            "alleleSymbol",
            lit(True).alias("doesIntermediateVectorProductsExist")
        )

        def _add_existence_flag(base_df, flag_df, flag_col):
            # Left join on alleleSymbol, then collapse the nullable marker into
            # a strict True/False boolean.
            joined_df = base_df.join(flag_df, "alleleSymbol", "left_outer")
            return joined_df.withColumn(
                flag_col,
                when(col(flag_col).isNotNull(), lit(True)).otherwise(lit(False)),
            )

        result_df = order_report_df
        for flag_df, flag_col in [
            (product_crispr_df, "doesCrisprProductsExist"),
            (product_mice_df, "doesMiceProductsExist"),
            (es_cell_product_df, "doesEsCellProductsExist"),
            (tvp_product_df, "doesTargetingVectorProductsExist"),
            (ivp_product_df, "doesIntermediateVectorProductsExist"),
        ]:
            result_df = _add_existence_flag(result_df, flag_df, flag_col)

        chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path)
        # BUGFIX: close the map file (was json.loads(open(...).read())).
        with open(chromosome_map_json_path) as chromosome_map_file:
            chromosome_map_json = json.load(chromosome_map_file)
        chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"])
        # Keep only genes present in the chromosome map; rows with no order data
        # (alleleName null after the left join) are dropped.
        result_df = chromosome_map_df.join(result_df, "mgiGeneAccessionId", "left_outer")
        result_df = result_df.filter(col("alleleName").isNotNull())
        result_df = result_df.drop("chromosome")
        # ":" is not filesystem-safe, so MGI:123 becomes MGI_123 in partition dirs.
        result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_"))
        (result_df
         .repartition("mgiGeneAccessionId")
         .write
         .mode("overwrite")  # dropped csv-only .option("header", True): a no-op on the JSON writer
         .partitionBy("mgiGeneAccessionId")
         .json(f"{get_data_release_work_dir()}/output/impc_spa/allele_summary_temp_json")
         )
        print("Finished")

    @task
    def process_temp_folder():
        """Fan the partitioned JSON out into
        <chromosome>/<gene>/<allele>/summary.json files and delete the temp dir."""
        import json
        import os
        import shutil
        from glob import iglob
        from urllib.parse import unquote, urlparse

        chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path)
        with open(chromosome_map_json_path) as chromosome_map_file:
            chromosome_map_json = json.load(chromosome_map_file)

        input_path = f"{get_data_release_work_dir()}/output/impc_spa/allele_summary_temp_json"
        output_path = f"{get_data_release_work_dir()}/output/impc_spa"
        # Partition layout is one level deep: mgiGeneAccessionId=<id>/part-*.json
        for file_path in iglob(f"{input_path}/**/*.json"):
            filepath_parts = file_path.split("/")
            filepath_parts.pop()  # drop the part-file name
            parent_dir = filepath_parts.pop()  # "mgiGeneAccessionId=MGI_..."
            mgi_gene_accession_id = parent_dir.split("=")[1]
            original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":")
            chromosome = chromosome_map_json[original_mgi_gene_accession_id]
            chromosome_folder = f"{output_path}/{chromosome}"
            os.makedirs(chromosome_folder, exist_ok=True)
            gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}"
            os.makedirs(gene_dir_path, exist_ok=True)

            # BUGFIX: the part file was opened without ever being closed.
            with open(file_path) as allele_data:
                for json_obj_str in allele_data:
                    allele_obj = json.loads(json_obj_str)
                    # alleleSymbol is "<geneSymbol><alleleName>", e.g. "Xyz<tm1a...>".
                    allele_obj["geneSymbol"] = allele_obj["alleleSymbol"].split("<")[0]

                    allele_name = allele_obj["alleleName"].replace("/", "_")
                    allele_dir_path = f"{gene_dir_path}/{allele_name}"
                    os.makedirs(allele_dir_path, exist_ok=True)
                    file_to_be_generated_path = f"{allele_dir_path}/summary.json"
                    # BUGFIX: the original wrote f"[{','.join(allele_obj)}]",
                    # which joins the dict's *keys* and emits invalid JSON.
                    # NOTE(review): emitting a single JSON object per allele;
                    # confirm the frontend does not expect a one-element array.
                    with open(file_to_be_generated_path, "w") as allele_file:
                        json.dump(allele_obj, allele_file)
        shutil.rmtree(input_path)

    chain(process_reports(), process_temp_folder())
impc_spa_gene_order_mapper()
"""Airflow DAG: build per-gene, per-parameter image JSON files (mutant + control)
for the IMPC SPA."""
import logging
import textwrap
from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset
from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir
from impc_etl.utils.impc_spa import generate_valid_json_from_file
from impc_etl.utils.spark import with_spark_session

task_logger = logging.getLogger("airflow.task")
dr_tag = Variable.get("data_release_tag")

images_service_json_asset = create_input_asset("output/impc_web_api/images_service_json")
chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json")
images_asset = AssetAlias("impc_spa_images")

@dag(
    schedule=[images_service_json_asset, chromosome_map_json_asset],
    dag_id=f"{dr_tag}_impc_spa_images_mapper",
    description=textwrap.dedent(
        """IMPC SPA images mapper DAG."""
    ),
    tags=["impc_spa", "images"],
)
def impc_spa_images_mapper():
    @with_spark_session
    @task
    def process_images_parquet():
        """Split the images data into experimental images partitioned by
        gene + parameter, and control images partitioned by parameter only."""
        import json
        from pyspark.sql import SparkSession
        from pyspark.sql.functions import col, regexp_replace
        from urllib.parse import unquote, urlparse

        spark = SparkSession.builder.getOrCreate()

        chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path)
        # BUGFIX: close the map file (was json.loads(open(...).read())).
        with open(chromosome_map_json_path) as chromosome_map_file:
            chromosome_map_json = json.load(chromosome_map_file)
        chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"])
        parquet_df = spark.read.json(images_service_json_asset.uri)
        # Control images are not gene-specific; keep them aside keyed by parameter.
        control_images_df = parquet_df.filter(col("biologicalSampleGroup") == "control")

        result_df = chromosome_map_df.join(parquet_df, "mgiGeneAccessionId", "left_outer")
        # BUGFIX: consistent column casing — "biologicalSamplegroup" only worked
        # through Spark's default case-insensitive column resolution.
        # The isNotNull filter drops genes with no images after the left join.
        result_df = result_df.filter(col("biologicalSampleGroup").isNotNull())
        result_df = result_df.filter(col("biologicalSampleGroup") == "experimental")
        result_df = result_df.drop("chromosome")
        # ":" is not filesystem-safe, so MGI:123 becomes MGI_123 in partition dirs.
        result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_"))
        (result_df
         .repartition("mgiGeneAccessionId")
         .write
         .mode("overwrite")
         .partitionBy("mgiGeneAccessionId", "parameterStableId")
         .json(f"{get_data_release_work_dir()}/output/impc_spa/experimental_images_temp_json")
         )
        (control_images_df
         .repartition("parameterStableId")
         .write
         .mode("overwrite")
         .partitionBy("parameterStableId")
         .json(f"{get_data_release_work_dir()}/output/impc_spa/control_images_temp_json")
         )
        print("Finished")

    @task
    def process_control_folder():
        """Collapse each parameterStableId=<id> partition into a single
        <id>.json staging file reused by the experimental step."""
        from glob import iglob
        import os
        import shutil

        input_path = f"{get_data_release_work_dir()}/output/impc_spa/control_images_temp_json"
        output_path = f"{get_data_release_work_dir()}/output/impc_spa/control_images_by_parameter"
        os.makedirs(output_path, exist_ok=True)
        # Partition layout is one level deep: parameterStableId=<id>/part-*.json
        for file_path in iglob(f"{input_path}/**/*.json"):
            filepath_parts = file_path.split("/")
            filepath_parts.pop()  # drop the part-file name
            parent_dir = filepath_parts.pop()  # "parameterStableId=<id>"
            parameter_stable_id = parent_dir.split("=")[1]
            file_to_be_generated_path = f"{output_path}/{parameter_stable_id}.json"
            with open(file_to_be_generated_path, "w") as output_file:
                output_file.write(generate_valid_json_from_file(file_path))
        shutil.rmtree(input_path)
        print("Finished")

    @task
    def process_experimental_folder():
        """Write <chromosome>/<gene>/images/<parameter>/mutant.json for each
        experimental partition and copy the matching control.json next to it."""
        from glob import iglob
        from urllib.parse import unquote, urlparse
        import json
        import os
        import shutil

        chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path)
        with open(chromosome_map_json_path) as chromosome_map_file:
            chromosome_map_json = json.load(chromosome_map_file)

        control_images_path = f"{get_data_release_work_dir()}/output/impc_spa/control_images_by_parameter"
        input_path = f"{get_data_release_work_dir()}/output/impc_spa/experimental_images_temp_json"
        output_path = f"{get_data_release_work_dir()}/output/impc_spa"

        # Two partition levels here (gene then parameter), hence recursive=True.
        for file_path in iglob(f"{input_path}/**/*.json", recursive=True):
            filepath_parts = file_path.split("/")
            filepath_parts.pop()  # drop the part-file name
            parent_dir = filepath_parts.pop()  # "parameterStableId=<id>"
            parameter_stable_id = parent_dir.split("=")[1]
            parent_dir = filepath_parts.pop()  # "mgiGeneAccessionId=MGI_..."
            mgi_gene_accession_id = parent_dir.split("=")[1]
            original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":")

            chromosome = chromosome_map_json[original_mgi_gene_accession_id]
            chromosome_folder = f"{output_path}/{chromosome}"
            os.makedirs(chromosome_folder, exist_ok=True)

            gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}"
            os.makedirs(gene_dir_path, exist_ok=True)
            general_gene_images_path = f"{gene_dir_path}/images"
            os.makedirs(general_gene_images_path, exist_ok=True)
            parameter_images_path = f"{general_gene_images_path}/{parameter_stable_id}"
            os.makedirs(parameter_images_path, exist_ok=True)
            # writing experimental images
            file_to_be_generated_path = f"{parameter_images_path}/mutant.json"
            with open(file_to_be_generated_path, "w") as gene_file:
                gene_file.write(generate_valid_json_from_file(file_path))
            # copying control images to gene dir
            control_images_file_path = f"{control_images_path}/{parameter_stable_id}.json"
            if os.path.exists(control_images_file_path):
                shutil.copy(control_images_file_path, f"{parameter_images_path}/control.json")
        shutil.rmtree(input_path)
        shutil.rmtree(control_images_path)
        print("Finished")

    chain(process_images_parquet(), process_control_folder(), process_experimental_folder())
impc_spa_images_mapper()
"""Airflow DAG: write one JSON file per significant phenotype per gene for the
IMPC SPA (<chromosome>/<gene>/significant_phenotypes/<phenotype-id>.json)."""
import logging
import textwrap
from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset
from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir
from impc_etl.utils.impc_spa import write_partitioned_data
from impc_etl.utils.spark import with_spark_session

task_logger = logging.getLogger("airflow.task")
dr_tag = Variable.get("data_release_tag")

datasets_metadata_service_json_asset = create_input_asset("output/impc_web_api/datasets_metadata_service_json")
chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json")
gene_sig_phenotypes_asset = AssetAlias("impc_spa_gene_sig_phenotypes")

@dag(
    schedule=[datasets_metadata_service_json_asset, chromosome_map_json_asset],
    dag_id=f"{dr_tag}_impc_spa_gene_sig_phenotypes_mapper",
    description=textwrap.dedent(
        """IMPC SPA gene significant phenotypes mapper DAG."""
    ),
    tags=["impc_spa", "gene", "significant phenotypes"],
)
def impc_spa_gene_significant_phenotypes_mapper():
    @with_spark_session
    @task
    def process_parquet():
        """Partition the datasets metadata by gene, keeping only rows flagged
        as significant."""
        from pyspark.sql import SparkSession
        from pyspark.sql.functions import col

        def filter_dataframe(df):
            # NOTE(review): compares against the string "true" — assumes the
            # "significant" field is a string in the source JSON; confirm it is
            # not a boolean column.
            return df.where(col("significant") == "true")

        spark = SparkSession.builder.getOrCreate()
        write_partitioned_data(
            spark,
            chromosome_map_path=chromosome_map_json_asset.uri,
            parquet_path=datasets_metadata_service_json_asset.uri,
            col_to_filter="significantPhenotype",
            temp_folder_path="sig_phenotypes_temp_json",
            filtered_dataframe_fn=filter_dataframe
        )
        print("Finished")

    @task(outlets=[gene_sig_phenotypes_asset])
    def process_temp_folder(*, outlet_events):
        """Fan each gene partition out into one JSON file per significant
        phenotype, then delete the temp folder."""
        from urllib.parse import unquote, urlparse
        from glob import iglob
        import json
        import os
        import shutil

        chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path)
        # BUGFIX: close the map file (was json.loads(open(...).read())).
        with open(chromosome_map_json_path) as chromosome_map_file:
            chromosome_map_json = json.load(chromosome_map_file)

        input_path = f"{get_data_release_work_dir()}/output/impc_spa/sig_phenotypes_temp_json"
        output_path = f"{get_data_release_work_dir()}/output/impc_spa"
        # Partition layout is one level deep: mgiGeneAccessionId=<id>/part-*.json
        for file_path in iglob(f"{input_path}/**/*.json"):
            filepath_parts = file_path.split("/")
            filepath_parts.pop()  # drop the part-file name
            parent_dir = filepath_parts.pop()  # "mgiGeneAccessionId=MGI_..."
            mgi_gene_accession_id = parent_dir.split("=")[1]
            original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":")

            chromosome = chromosome_map_json[original_mgi_gene_accession_id]
            chromosome_folder = f"{output_path}/{chromosome}"
            os.makedirs(chromosome_folder, exist_ok=True)

            gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}"
            os.makedirs(gene_dir_path, exist_ok=True)
            general_phenotypes_dir_path = f"{gene_dir_path}/significant_phenotypes"
            os.makedirs(general_phenotypes_dir_path, exist_ok=True)
            # BUGFIX: the part file was opened without ever being closed.
            with open(file_path) as json_file:
                for json_obj_str in json_file:
                    sig_phenotype_obj = json.loads(json_obj_str)
                    # Phenotype ids (e.g. MP:0001234) are escaped for the filesystem.
                    sig_phenotype_id = sig_phenotype_obj["significantPhenotype"]["id"].replace(":", "_")
                    file_to_be_generated_path = f"{general_phenotypes_dir_path}/{sig_phenotype_id}.json"
                    with open(file_to_be_generated_path, "w") as phenotype_file:
                        phenotype_file.write(json_obj_str)
        shutil.rmtree(input_path)
        print("Finished")

    chain(process_parquet(), process_temp_folder())
impc_spa_gene_significant_phenotypes_mapper()
impc_etl/jobs/load/impc_spa/impc_gene_all_phenotype_data_mapper.py diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_all_phenotype_data_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_all_phenotype_data_mapper.py new file mode 100644 index 00000000..b1891124 --- /dev/null +++ b/impc_etl/jobs/load/impc_spa/impc_gene_all_phenotype_data_mapper.py @@ -0,0 +1,40 @@ +import logging +import textwrap +from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset +from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.impc_spa import write_partitioned_data +from impc_etl.utils.spark import with_spark_session + +task_logger = logging.getLogger("airflow.task") +dr_tag = Variable.get("data_release_tag") + +datasets_metadata_service_json_asset = create_input_asset("output/impc_web_api/datasets_metadata_service_json") +chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") +gene_sig_phenotypes_asset = AssetAlias("impc_spa_gene_sig_phenotypes") + +@dag( + schedule=[datasets_metadata_service_json_asset, chromosome_map_json_asset], + dag_id=f"{dr_tag}_impc_spa_all_phenotype_data_mapper", + description=textwrap.dedent( + """IMPC SPA gene all phenotype data mapper DAG.""" + ), + tags=["impc_spa", "gene", "all phenotype data"], +) +def impc_spa_gene_all_phenotype_data_mapper(): + @with_spark_session + @task + def process_parquet(): + from pyspark.sql import SparkSession + from pyspark.sql.functions import col + + spark = SparkSession.builder.getOrCreate() + dataset_df = spark.read.json(datasets_metadata_service_json_asset.uri) + + print("Finished") + + @task(outlets=[gene_sig_phenotypes_asset]) + def process_temp_folder(*, outlet_events): + print("Finished") + + chain(process_parquet(), process_temp_folder()) +impc_spa_gene_all_phenotype_data_mapper() diff --git a/impc_etl/utils/impc_spa.py b/impc_etl/utils/impc_spa.py index 064e70ac..b41a4c55 100644 --- a/impc_etl/utils/impc_spa.py +++ 
b/impc_etl/utils/impc_spa.py @@ -71,7 +71,7 @@ def process_temp_folder_into_files( file_to_be_generated_path = f"{gene_dir_path}/{file_name}" with open(file_to_be_generated_path, "w") as gene_file: gene_file.write(generate_valid_json_from_file(file_path)) - outlet_events[asset_alias].add(Asset(f"file://{file_to_be_generated_path}")) + # outlet_events[asset_alias].add(Asset(f"file://{file_to_be_generated_path}")) shutil.rmtree(input_path) def create_gentar_crispr_report_df( From 623890cec4fe9b64c640cdf14670c76cdd4ddf3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Thu, 4 Dec 2025 17:30:46 +0000 Subject: [PATCH 50/51] feat: initial all phenotype data mapper task --- .../impc_gene_all_phenotype_data_mapper.py | 64 +++++++++++++++++-- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_gene_all_phenotype_data_mapper.py b/impc_etl/jobs/load/impc_spa/impc_gene_all_phenotype_data_mapper.py index b1891124..c18b0e15 100644 --- a/impc_etl/jobs/load/impc_spa/impc_gene_all_phenotype_data_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_gene_all_phenotype_data_mapper.py @@ -2,7 +2,7 @@ import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir -from impc_etl.utils.impc_spa import write_partitioned_data +from impc_etl.utils.impc_spa import write_partitioned_data, generate_valid_json_from_file from impc_etl.utils.spark import with_spark_session task_logger = logging.getLogger("airflow.task") @@ -10,7 +10,7 @@ datasets_metadata_service_json_asset = create_input_asset("output/impc_web_api/datasets_metadata_service_json") chromosome_map_json_asset = create_input_asset("output/impc_spa/chromosome-map.json") -gene_sig_phenotypes_asset = AssetAlias("impc_spa_gene_sig_phenotypes") +gene_all_phenotype_data_asset = AssetAlias("impc_spa_gene_all_phenotype_data") @dag( schedule=[datasets_metadata_service_json_asset, 
chromosome_map_json_asset], @@ -25,15 +25,71 @@ def impc_spa_gene_all_phenotype_data_mapper(): @task def process_parquet(): from pyspark.sql import SparkSession - from pyspark.sql.functions import col + from pyspark.sql.functions import col, regexp_replace + from urllib.parse import unquote, urlparse + import json spark = SparkSession.builder.getOrCreate() + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + chromosome_map_df = spark.createDataFrame(chromosome_map_json.items(), ["mgiGeneAccessionId", "chromosome"]) + dataset_df = spark.read.json(datasets_metadata_service_json_asset.uri) + result_df = chromosome_map_df.join(dataset_df, "mgiGeneAccessionId", "left_outer") + result_df = result_df.filter(col("pipelineStableId").isNotNull()) + result_df = result_df.drop("chromosome") + result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) + + (result_df + .repartition("mgiGeneAccessionId") + .write + .mode("overwrite") + .partitionBy("mgiGeneAccessionId", "pipelineStableId", "procedureStableId") + .json(f"{get_data_release_work_dir()}/output/impc_spa/all_ph_data_temp_json") + ) print("Finished") - @task(outlets=[gene_sig_phenotypes_asset]) + @task(outlets=[gene_all_phenotype_data_asset]) def process_temp_folder(*, outlet_events): + import json + import os + import shutil + from glob import iglob + from urllib.parse import unquote, urlparse + + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/all_ph_data_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json", recursive=True): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + 
procedure_stable_id = parent_dir.split("=")[1] + parent_dir = filepath_parts.pop() + pipeline_stable_id = parent_dir.split("=")[1] + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + os.makedirs(gene_dir_path, exist_ok=True) + general_gene_images_path = f"{gene_dir_path}/pipeline" + os.makedirs(general_gene_images_path, exist_ok=True) + pipeline_dir_path = f"{general_gene_images_path}/{pipeline_stable_id}" + os.makedirs(pipeline_dir_path, exist_ok=True) + # write data + file_to_be_generated_path = f"{pipeline_dir_path}/{procedure_stable_id}.json" + with open(file_to_be_generated_path, "w") as dataset_file: + dataset_file.write(generate_valid_json_from_file(file_path)) + shutil.rmtree(input_path) print("Finished") chain(process_parquet(), process_temp_folder()) From 37e063031082773a9fa25ec7e36a32f016ddcfd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Pen=CC=83a?= Date: Fri, 5 Dec 2025 17:20:25 +0000 Subject: [PATCH 51/51] feat: diseases mapper task --- .../load/impc_spa/impc_diseases_mapper.py | 59 +++++++++---------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py b/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py index 529e99d3..4ef1df56 100644 --- a/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py +++ b/impc_etl/jobs/load/impc_spa/impc_diseases_mapper.py @@ -3,6 +3,7 @@ import textwrap from airflow.sdk import Variable, task, AssetAlias, chain, dag, Asset from impc_etl.utils.airflow import create_input_asset, get_data_release_work_dir +from impc_etl.utils.impc_spa import generate_valid_json_from_file from impc_etl.utils.spark import 
with_spark_session task_logger = logging.getLogger("airflow.task") @@ -25,7 +26,7 @@ def impc_spa_gene_diseases_mapper(): @task def process_gene_diseases(): from pyspark.sql import SparkSession - from pyspark.sql.functions import regexp_replace + from pyspark.sql.functions import col, regexp_replace from urllib.parse import unquote, urlparse spark = SparkSession.builder.getOrCreate() @@ -37,10 +38,11 @@ def process_gene_diseases(): gene_diseases_df = spark.read.json(gene_diseases_service_json_path) result_df = chromosome_map_df.join(gene_diseases_df, "mgiGeneAccessionId", "left_outer") result_df.show() - # result_df = result_df.filter(result_df.href.isNotNull()) + result_df = result_df.filter(col("associationCurated").isNotNull()) result_df = result_df.drop("chromosome") result_df = result_df.withColumn("mgiGeneAccessionId", regexp_replace("mgiGeneAccessionId", ":", "_")) (result_df + .repartition("mgiGeneAccessionId") .write .option("header", True) .mode("overwrite") @@ -56,34 +58,31 @@ def process_temp_folder(*, outlet_events): from glob import iglob from urllib.parse import unquote, urlparse - # def generate_valid_json_from_file(file_path): - # file_data = open(file_path, 'r') - # lines = file_data.readlines() - # return f"[{','.join(lines)}]" - # - # chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) - # chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) - # - # input_path = f"{get_data_release_work_dir()}/output/impc_spa/external_links_temp_json" - # output_path = f"{get_data_release_work_dir()}/output/impc_spa" - # for file_path in iglob(f"{input_path}/**/*.json"): - # filepath_parts = file_path.split("/") - # filepath_parts.pop() - # parent_dir = filepath_parts.pop() - # mgi_gene_accession_id = parent_dir.split("=")[1] - # original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") - # - # chromosome = chromosome_map_json[original_mgi_gene_accession_id] - # chromosome_folder = 
f"{output_path}/{chromosome}" - # os.makedirs(chromosome_folder, exist_ok=True) - # - # gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" - # gene_external_links_path = f"{gene_dir_path}/external-links.json" - # with open(gene_external_links_path, "w") as gene_file: - # gene_file.write(generate_valid_json_from_file(file_path)) - # outlet_events[gene_diseases_asset].add(Asset(f"file://{gene_external_links_path}")) - # - # shutil.rmtree(input_path) + chromosome_map_json_path = unquote(urlparse(chromosome_map_json_asset.uri).path) + chromosome_map_json = json.loads(open(chromosome_map_json_path).read()) + + input_path = f"{get_data_release_work_dir()}/output/impc_spa/gene_diseases_temp_json" + output_path = f"{get_data_release_work_dir()}/output/impc_spa" + for file_path in iglob(f"{input_path}/**/*.json", recursive=True): + filepath_parts = file_path.split("/") + filepath_parts.pop() + parent_dir = filepath_parts.pop() + association_status = parent_dir.split("=")[1] + parent_dir = filepath_parts.pop() + mgi_gene_accession_id = parent_dir.split("=")[1] + original_mgi_gene_accession_id = mgi_gene_accession_id.replace("_", ":") + + chromosome = chromosome_map_json[original_mgi_gene_accession_id] + chromosome_folder = f"{output_path}/{chromosome}" + os.makedirs(chromosome_folder, exist_ok=True) + + gene_dir_path = f"{chromosome_folder}/{mgi_gene_accession_id}" + os.makedirs(gene_dir_path, exist_ok=True) + file_name = 'associated-diseases' if association_status == "true" else 'predicted-diseases' + gene_external_links_path = f"{gene_dir_path}/{file_name}.json" + with open(gene_external_links_path, "w") as gene_file: + gene_file.write(generate_valid_json_from_file(file_path)) + shutil.rmtree(input_path) print("Finished") chain(process_gene_diseases(), process_temp_folder())