Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions pulp_python/app/pypi/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,6 @@ def parse_package(release_package):
@extend_schema(operation_id="pypi_simple_package_read", summary="Get package simple page")
def retrieve(self, request, path, package):
"""Retrieves the simple api html/json page for a package."""
media_type = request.accepted_renderer.media_type

repo_ver, content = self.get_rvc()
# Should I redirect if the normalized name is different?
normalized = canonicalize_name(package)
Expand Down
38 changes: 38 additions & 0 deletions pulp_python/app/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
from pulp_python.app.utils import (
DIST_EXTENSIONS,
artifact_to_metadata_artifact,
artifact_to_python_content_data,
get_project_metadata_from_file,
parse_project_metadata,
Expand Down Expand Up @@ -93,11 +94,35 @@ class Meta:
model = python_models.PythonDistribution


class PythonSingleContentArtifactField(core_serializers.SingleContentArtifactField):
    """
    Custom field with overridden get_attribute method. Meant to be used only in
    PythonPackageContentSerializer to handle possible existence of metadata artifact.
    """

    def get_attribute(self, instance):
        """
        Pick the artifact that represents the package file of ``instance``.

        Returns None when the content has no artifacts and the only artifact when there is
        exactly one. With several artifacts, the artifact of the first ContentArtifact whose
        relative path does not end in ".metadata" is returned; if every ContentArtifact is a
        ".metadata" one, the first artifact is returned.
        """
        artifacts = instance._artifacts
        # Count once and reuse the value instead of issuing the same count() lookup twice.
        artifact_count = artifacts.count()
        if artifact_count == 0:
            return None
        if artifact_count > 1:
            # first() yields None for an empty queryset, so a preceding exists() is redundant.
            main_content_artifact = instance.contentartifact_set.exclude(
                relative_path__endswith=".metadata"
            ).first()
            if main_content_artifact is not None:
                return main_content_artifact.artifact
        return artifacts.all()[0]


class PythonPackageContentSerializer(core_serializers.SingleArtifactContentUploadSerializer):
"""
A Serializer for PythonPackageContent.
"""

artifact = PythonSingleContentArtifactField(
help_text=_("Artifact file representing the physical content"),
)

# Core metadata
# Version 1.0
author = serializers.CharField(
Expand Down Expand Up @@ -386,8 +411,21 @@ def deferred_validate(self, data):
if attestations := data.pop("attestations", None):
data["provenance"] = self.handle_attestations(filename, data["sha256"], attestations)

# Create metadata artifact for wheel files
if filename.endswith(".whl"):
if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
data["metadata_artifact"] = metadata_artifact
data["metadata_sha256"] = metadata_artifact.sha256

return data

def get_artifacts(self, validated_data):
    """
    Extend the parent's artifact mapping with the metadata artifact, if one was prepared.

    "metadata_artifact" is removed from ``validated_data`` (when present) and registered
    under the relative path "<filename>.metadata".
    """
    artifact_map = super().get_artifacts(validated_data)
    metadata_artifact = validated_data.pop("metadata_artifact", None)
    if not metadata_artifact:
        return artifact_map
    artifact_map[f"{validated_data['filename']}.metadata"] = metadata_artifact
    return artifact_map

def retrieve(self, validated_data):
content = python_models.PythonPackageContent.objects.filter(
sha256=validated_data["sha256"], _pulp_domain=get_domain()
Expand Down
25 changes: 23 additions & 2 deletions pulp_python/app/tasks/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,15 @@ async def create_content(self, pkg):
create a Content Unit to put into the pipeline
"""
declared_contents = {}
page = await aget_remote_simple_page(pkg.name, self.remote)
upstream_pkgs = {pkg.filename: pkg for pkg in page.packages}

for version, dists in pkg.releases.items():
for package in dists:
entry = parse_metadata(pkg.info, version, package)
url = entry.pop("url")
size = package["size"] or None
d_artifacts = []

artifact = Artifact(sha256=entry["sha256"], size=size)
package = PythonPackageContent(**entry)
Expand All @@ -245,11 +249,28 @@ async def create_content(self, pkg):
remote=self.remote,
deferred_download=self.deferred_download,
)
dc = DeclarativeContent(content=package, d_artifacts=[da])
d_artifacts.append(da)

if upstream_pkg := upstream_pkgs.get(entry["filename"]):
if upstream_pkg.has_metadata:
url = upstream_pkg.metadata_url
md_sha256 = upstream_pkg.metadata_digests.get("sha256")
artifact = Artifact(sha256=md_sha256)

metadata_artifact = DeclarativeArtifact(
artifact=artifact,
url=url,
relative_path=f"{entry['filename']}.metadata",
remote=self.remote,
deferred_download=self.deferred_download,
)
d_artifacts.append(metadata_artifact)

dc = DeclarativeContent(content=package, d_artifacts=d_artifacts)
declared_contents[entry["filename"]] = dc
await self.python_stage.put(dc)

if pkg.releases and (page := await aget_remote_simple_page(pkg.name, self.remote)):
if pkg.releases and page:
if self.remote.provenance:
await self.sync_provenance(page, declared_contents)

Expand Down
7 changes: 6 additions & 1 deletion pulp_python/app/tasks/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
Provenance,
verify_provenance,
)
from pulp_python.app.utils import artifact_to_python_content_data
from pulp_python.app.utils import artifact_to_metadata_artifact, artifact_to_python_content_data


def upload(artifact_sha256, filename, attestations=None, repository_pk=None):
Expand Down Expand Up @@ -97,6 +97,11 @@ def create_content(artifact_sha256, filename, domain):
def create():
    """
    Create the PythonPackageContent and attach its artifact(s).

    ``data``, ``artifact`` and ``filename`` are taken from the enclosing scope. The package
    artifact is linked under ``filename``; when a metadata artifact can be derived from it,
    that one is linked under "<filename>.metadata".
    """
    content = PythonPackageContent.objects.create(**data)
    ContentArtifact.objects.create(artifact=artifact, content=content, relative_path=filename)

    metadata_artifact = artifact_to_metadata_artifact(filename, artifact)
    if metadata_artifact:
        metadata_path = f"{filename}.metadata"
        ContentArtifact.objects.create(
            artifact=metadata_artifact, content=content, relative_path=metadata_path
        )
    return content

new_content = create()
Expand Down
61 changes: 52 additions & 9 deletions pulp_python/app/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import hashlib
import logging
import pkginfo
import re
import shutil
Expand All @@ -14,10 +15,13 @@
from packaging.requirements import Requirement
from packaging.version import parse, InvalidVersion
from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage
from pulpcore.plugin.models import Remote
from pulpcore.plugin.models import Artifact, Remote
from pulpcore.plugin.exceptions import TimeoutException


log = logging.getLogger(__name__)


PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
"""TODO This serial constant is temporary until Python repositories implements serials"""
PYPI_SERIAL_CONSTANT = 1000000000
Expand All @@ -41,6 +45,8 @@
</html>
"""

# TODO in the future: data-requires-python (PEP 503)
# TODO now: strip empty lines
simple_detail_template = """<!DOCTYPE html>
<html>
<head>
Expand All @@ -50,7 +56,11 @@
<body>
<h1>Links for {{ project_name }}</h1>
{% for pkg in project_packages %}
<a href="{{ pkg.url }}#sha256={{ pkg.sha256 }}" rel="internal" {% if pkg.provenance -%}
<a href="{{ pkg.url }}#sha256={{ pkg.sha256 }}"
{% if pkg.metadata_sha256 %}
data-dist-info-metadata="sha256={{ pkg.metadata_sha256 }}"
{% endif %}
rel="internal" {% if pkg.provenance -%}
data-provenance="{{ pkg.provenance }}"{% endif %}>{{ pkg.filename }}</a><br/>
{% endfor %}
</body>
Expand Down Expand Up @@ -200,25 +210,34 @@ def get_project_metadata_from_file(filename):
return metadata


def compute_metadata_sha256(filename: str) -> str | None:
def extract_wheel_metadata(filename: str) -> bytes | None:
"""
Compute SHA256 hash of the metadata file from a Python package.
Extract the metadata file content from a wheel file.

Returns SHA256 hash or None if metadata cannot be extracted.
Returns the raw metadata content as bytes or None if metadata cannot be extracted.
"""
if not filename.endswith(".whl"):
return None
try:
with zipfile.ZipFile(filename, "r") as f:
for file_path in f.namelist():
if file_path.endswith(".dist-info/METADATA"):
metadata_content = f.read(file_path)
return hashlib.sha256(metadata_content).hexdigest()
except (zipfile.BadZipFile, KeyError, OSError):
pass
return f.read(file_path)
except (zipfile.BadZipFile, KeyError, OSError) as e:
log.warning(f"Failed to extract metadata file from {filename}: {e}")
return None


def compute_metadata_sha256(filename: str) -> str | None:
    """
    Hash the metadata file of a Python package with SHA256.

    Returns the hex digest, or None when extract_wheel_metadata() yields no content.
    """
    metadata_content = extract_wheel_metadata(filename)
    if not metadata_content:
        return None
    return hashlib.sha256(metadata_content).hexdigest()


def artifact_to_python_content_data(filename, artifact, domain=None):
"""
Takes the artifact/filename and returns the metadata needed to create a PythonPackageContent.
Expand All @@ -227,6 +246,7 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
# because pkginfo validates that the filename has a valid extension before
# reading it
with tempfile.NamedTemporaryFile("wb", dir=".", suffix=filename) as temp_file:
artifact.file.seek(0)
shutil.copyfileobj(artifact.file, temp_file)
temp_file.flush()
metadata = get_project_metadata_from_file(temp_file.name)
Expand All @@ -239,6 +259,28 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
return data


def artifact_to_metadata_artifact(filename: str, artifact: Artifact) -> Artifact | None:
    """
    Build and save an Artifact holding the metadata file of the provided wheel artifact.

    Returns the saved metadata Artifact, or None when ``filename`` does not end in ".whl"
    or no metadata content could be extracted from the wheel.
    """
    is_wheel = filename.endswith(".whl")
    if not is_wheel:
        return None

    # The wheel is copied into a named temporary file because extract_wheel_metadata()
    # works on a path, not on a file object.
    with tempfile.NamedTemporaryFile("wb", dir=".", suffix=filename) as wheel_copy:
        artifact.file.seek(0)
        shutil.copyfileobj(artifact.file, wheel_copy)
        wheel_copy.flush()
        raw_metadata = extract_wheel_metadata(wheel_copy.name)
        if raw_metadata:
            with tempfile.NamedTemporaryFile(suffix=".metadata") as metadata_file:
                metadata_file.write(raw_metadata)
                metadata_file.flush()
                # NOTE(review): save() is not guarded against an already stored artifact
                # with identical content - confirm whether duplicates need handling here.
                saved_artifact = Artifact.init_and_validate(metadata_file.name)
                saved_artifact.save()
                return saved_artifact
    return None


def fetch_json_release_metadata(name: str, version: str, remotes: set[Remote]) -> dict:
"""
Fetches metadata for a specific release from PyPI's JSON API. A release can contain
Expand Down Expand Up @@ -402,6 +444,7 @@ def find_artifact():
_art = models.RemoteArtifact.objects.filter(content_artifact=content_artifact).first()
return _art

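# TODO: .first() is ambiguous when the content also has a "<filename>.metadata" ContentArtifact; select the package file explicitly.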
content_artifact = content.contentartifact_set.first()
artifact = find_artifact()
origin = settings.CONTENT_ORIGIN or settings.PYPI_API_HOSTNAME or ""
Expand Down
Loading
Loading