diff --git a/.github/workflows/external-indices.yml b/.github/workflows/external-indices.yml
index fee3603..86de6ba 100644
--- a/.github/workflows/external-indices.yml
+++ b/.github/workflows/external-indices.yml
@@ -55,6 +55,6 @@ jobs:
         if: github.event_name == 'release' && github.event.action == 'published'
         uses: ncipollo/release-action@v1
         with:
-          artifacts: "release_artifacts/*.parquet,release_artifacts/*.json"
+          artifacts: "release_artifacts/*.parquet,release_artifacts/*.json,release_artifacts/*.sql"
           allowUpdates: true
           omitBodyDuringUpdate: true
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 413c530..02699e9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,13 +28,13 @@ add_custom_command(
 add_custom_target(run_idc_index_data_manager ALL
   DEPENDS
     $<$:${download_dir}/idc_index.csv.zip>
-    $<$:${download_dir}/idc_index.parquet>
-    $<$:${download_dir}/prior_versions_index.parquet>
+    $<$:${download_dir}/release_artifacts/idc_index.parquet>
+    $<$:${download_dir}/release_artifacts/prior_versions_index.parquet>
   )
 
 install(
   FILES
     $<$:${download_dir}/idc_index.csv.zip>
-    $<$:${download_dir}/idc_index.parquet>
-    $<$:${download_dir}/prior_versions_index.parquet>
+    $<$:${download_dir}/release_artifacts/idc_index.parquet>
+    $<$:${download_dir}/release_artifacts/prior_versions_index.parquet>
   DESTINATION "idc_index_data")
diff --git a/assets/README.md b/assets/README.md
deleted file mode 100644
index b72c27c..0000000
--- a/assets/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This folder contains SQL scripts that are used to generate tables that are
-attached to the releases as assets. Initially, those will be generated and
-attached manually, but in the future this process may be automated.
diff --git a/pyproject.toml b/pyproject.toml
index b67eb04..1ea044f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "idc-index-data"
-version = "22.1.1"
+version = "22.1.2"
 authors = [
   { name = "Andrey Fedorov", email = "andrey.fedorov@gmail.com" },
   { name = "Vamsi Thiriveedhi", email = "vthiriveedhi@mgh.harvard.edu" },
diff --git a/scripts/python/generate-indices.py b/scripts/python/generate-indices.py
index 73cb8aa..1dc2471 100644
--- a/scripts/python/generate-indices.py
+++ b/scripts/python/generate-indices.py
@@ -23,10 +23,13 @@ def main():
 
     for file_name in sql_files:
         file_path = assets_dir / file_name
-        index_df, output_basename, schema = manager.execute_sql_query(file_path)
+        index_df, output_basename, schema, sql_query = manager.execute_sql_query(
+            file_path
+        )
         parquet_file_path = output_dir / f"{output_basename}.parquet"
         index_df.to_parquet(parquet_file_path)
         manager.save_schema_to_json(schema, output_basename, output_dir)
+        manager.save_sql_query(sql_query, output_basename, output_dir)
 
     core_indices_dir = scripts_dir.parent / "scripts" / "sql"
 
@@ -34,7 +37,9 @@
 
     for file_name in sql_files:
         file_path = core_indices_dir / file_name
-        index_df, output_basename, schema = manager.execute_sql_query(file_path)
+        index_df, output_basename, schema, sql_query = manager.execute_sql_query(
+            file_path
+        )
         parquet_file_path = output_dir / f"{output_basename}.parquet"
         index_df.to_parquet(parquet_file_path)
         manager.save_schema_to_json(schema, output_basename, output_dir)
diff --git a/scripts/python/idc_index_data_manager.py b/scripts/python/idc_index_data_manager.py
index cfd964c..1fd6d65 100644
--- a/scripts/python/idc_index_data_manager.py
+++ b/scripts/python/idc_index_data_manager.py
@@ -40,7 +40,7 @@ def execute_sql_query(
         index_df["StudyDate"] = index_df["StudyDate"].astype(str)
         output_basename = Path(file_path).name.split(".")[0]
         logger.debug("Executed SQL query from file: %s", file_path)
-        return index_df, output_basename, schema
+        return index_df, output_basename, schema, sql_query
 
     def save_schema_to_json(
         self,
@@ -79,6 +79,31 @@ def save_schema_to_json(
         json.dump(schema_dict, f, indent=2)
         logger.debug("Created schema JSON file: %s", json_file_path)
 
+    def save_sql_query(
+        self,
+        sql_query: str,
+        output_basename: str,
+        output_dir: Path | None = None,
+    ) -> None:
+        """
+        Saves the SQL query to a file.
+
+        Args:
+            sql_query: The SQL query string
+            output_basename: The base name for the output file
+            output_dir: Optional directory path for the output file
+        """
+
+        if output_dir:
+            output_dir.mkdir(parents=True, exist_ok=True)
+            query_file_path = output_dir / f"{output_basename}.sql"
+        else:
+            query_file_path = Path(f"{output_basename}.sql")
+
+        with query_file_path.open("w") as f:
+            f.write(sql_query)
+        logger.debug("Created SQL query file: %s", query_file_path)
+
     def generate_index_data_files(
         self,
         generate_compressed_csv: bool = True,
@@ -108,7 +133,9 @@ def generate_index_data_files(
         for file_name in Path.iterdir(sql_dir):
             if str(file_name).endswith(".sql"):
                 file_path = Path(sql_dir) / file_name
-                index_df, output_basename, schema = self.execute_sql_query(file_path)
+                index_df, output_basename, schema, sql_query = self.execute_sql_query(
+                    file_path
+                )
                 logger.debug(
                     "Executed and processed SQL queries from file: %s", file_path
                 )
@@ -132,8 +159,10 @@ def generate_index_data_files(
                     index_df.to_parquet(parquet_file_path, compression="zstd")
                     logger.debug("Created Parquet file: %s", parquet_file_path)
 
-                # Save schema to JSON file
-                self.save_schema_to_json(schema, output_basename, output_dir)
+                # Save schema to JSON file
+                self.save_schema_to_json(schema, output_basename, output_dir)
+                # Save SQL query to file
+                self.save_sql_query(sql_query, output_basename, output_dir)
 
     def retrieve_latest_idc_release_version(self) -> int:
         """
@@ -167,17 +196,24 @@
         "--generate-csv-archive",
         action="store_true",
         help="Generate idc_index.csv.zip file",
+        default=False,
     )
     parser.add_argument(
         "--generate-parquet",
         action="store_true",
         help="Generate idc_index.parquet file",
+        default=True,
     )
     parser.add_argument(
         "--retrieve-latest-idc-release-version",
         action="store_true",
         help="Retrieve and display the latest IDC release version",
    )
+    parser.add_argument(
+        "--output-dir",
+        default="release_artifacts",
+        help="Directory to save generated files (default: release_artifacts)",
+    )
 
     args = parser.parse_args()
 
@@ -190,6 +226,7 @@ def retrieve_latest_idc_release_version(self) -> int:
         IDCIndexDataManager(args.project).generate_index_data_files(
             generate_compressed_csv=args.generate_csv_archive,
             generate_parquet=args.generate_parquet,
+            output_dir=Path(args.output_dir),
         )
     elif args.retrieve_latest_idc_release_version:
         logging.basicConfig(level=logging.ERROR, force=True)
diff --git a/assets/clinical_index.sql b/scripts/sql/clinical_index.sql
similarity index 100%
rename from assets/clinical_index.sql
rename to scripts/sql/clinical_index.sql
diff --git a/assets/sm_index.sql b/scripts/sql/sm_index.sql
similarity index 100%
rename from assets/sm_index.sql
rename to scripts/sql/sm_index.sql
diff --git a/assets/sm_instance_index.sql b/scripts/sql/sm_instance_index.sql
similarity index 100%
rename from assets/sm_instance_index.sql
rename to scripts/sql/sm_instance_index.sql
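
Usage note (not part of the patch): a minimal, hypothetical sketch of how the pieces introduced above fit together once applied. The GCP project id and the choice of sm_index.sql are placeholders, and the snippet assumes it is run from scripts/python/ so the manager module imports directly.

    from pathlib import Path

    from idc_index_data_manager import IDCIndexDataManager

    # Placeholder project id; the constructor takes the BigQuery project, as in args.project above.
    manager = IDCIndexDataManager("my-gcp-project")
    output_dir = Path("release_artifacts")
    output_dir.mkdir(parents=True, exist_ok=True)

    # execute_sql_query now returns the query text as a fourth value (see the diff above).
    index_df, basename, schema, sql_query = manager.execute_sql_query(
        Path("../sql/sm_index.sql")
    )
    index_df.to_parquet(output_dir / f"{basename}.parquet")
    manager.save_schema_to_json(schema, basename, output_dir)
    # New in this patch: the generating SQL is written next to the parquet/json artifacts,
    # so the release step can upload release_artifacts/*.parquet, *.json and *.sql together.
    manager.save_sql_query(sql_query, basename, output_dir)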