diff --git a/argo/capoci-e2e-workflow.yaml b/argo/capoci-e2e-workflow.yaml
new file mode 100644
index 00000000..95a5327a
--- /dev/null
+++ b/argo/capoci-e2e-workflow.yaml
@@ -0,0 +1,368 @@
+# Argo Workflow to run CAPOCI e2e tests via scripts/ci-e2e.sh
+# - Uses docker-in-docker (dind) sidecar so the Makefile can docker build/push
+# - Installs minimal tooling in the main container (git, make, curl, openssh-client, docker CLI)
+# - Clones this repository and invokes ./scripts/ci-e2e.sh (which does the build and testing)
+# - Maps parameters to environment variables consumed by the script and e2e config
+#
+# Submit directly (no template install needed):
+#   argo submit argo/capoci-e2e-workflow.yaml \
+#     -p oci_compartment_id=ocid1.compartment.oc1..example \
+#     -p oci_image_id=ocid1.image.oc1..example \
+#     -p oci_oracle_linux_image_id=ocid1.image.oc1..example \
+#     -p oci_upgrade_image_id=ocid1.image.oc1..example \
+#     -p oci_managed_node_image_id=ocid1.image.oc1..optional \
+#     -p oci_alternative_region_image_id=ocid1.image.oc1..example \
+#     -p registry=ghcr.io/your-org \
+#     -p use_instance_principal=true \
+#     -p use_instance_principal_b64=dHJ1ZQ== \
+#     -p oci_ssh_key="ssh-rsa AAAA... your-key"
+#
+# Notes:
+# - USE_INSTANCE_PRINCIPAL_B64 commonly needs "dHJ1ZQ==" for true, "ZmFsc2U=" for false (base64).
+# - If OCI_SSH_KEY is omitted, the script will generate a temporary key.
+# - REGISTRY defaults to ghcr.io/oracle; override if pushing to your org's registry.
+# - git_ref defaults to main
+# - git_repo defaults to https://github.com/oracle/cluster-api-provider-oci.git
+# - ginkgo_focus can be used to run a single test
+
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  namespace: argo
+  generateName: capoci-e2e-
+spec:
+  entrypoint: run
+  serviceAccountName: argo
+  parallelism: 1
+  podGC:
+    strategy: OnWorkflowSuccess
+  ttlStrategy:
+    secondsAfterSuccess: 86400
+    secondsAfterFailure: 604800
+
+  # Default parameter values (override at submit time)
+  arguments:
+    parameters:
+      - name: git_repo
+        value: https://github.com/oracle/cluster-api-provider-oci.git
+      - name: git_ref
+        value: main
+
+      - name: registry
+        value: ghcr.io/oracle
+
+      # REQUIRED by scripts/ci-e2e.sh
+      - name: oci_compartment_id
+        value: ""
+      - name: oci_image_id
+        value: ""
+      - name: oci_oracle_linux_image_id
+        value: ""
+      - name: oci_upgrade_image_id
+        value: ""
+      - name: oci_alternative_region_image_id
+        value: ""
+      - name: oci_managed_node_image_id
+        value: ""
+
+      # Optional image IDs
+      - name: oci_windows_image_id
+        value: ""
+
+      # Optional SSH public key. If empty, the script generates one.
+      - name: oci_ssh_key
+        value: ""
+
+      # Feature/behavior toggles
+      - name: cluster_topology
+        value: "true"
+      - name: exp_machine_pool
+        value: "true"
+      - name: exp_oke
+        value: "false"
+      - name: use_instance_principal
+        value: "false"
+      # Base64 of "true" or "false". Common values: true=dHJ1ZQ==, false=ZmFsc2U=
+      - name: use_instance_principal_b64
+        value: "dHJ1ZQ=="
+      - name: oci_alternative_region
+        value: "us-sanjose-1"
+
+      # Test execution tuning
+      - name: ginkgo_nodes
+        value: "3"
+      - name: ginkgo_focus
+        value: ""
+      - name: kind_node_image
+        value: "kindest/node:v1.29.6"
+      - name: tag
+        value: ""
+      - name: node_machine_count
+        value: "1"
+      - name: ocir_username
+        value: ""
+      - name: ocir_region
+        value: "us-phoenix-1"
+      - name: ocir_region_short_code
+        value: "phx"
+
+  templates:
+    - name: run
+      outputs:
+        artifacts:
+          - name: e2e-report
+            path: /workspace/report.json
+            optional: true
+          - name: e2e-logs
+            path: /workspace/_artifacts_logs_only
+            optional: true
+          - name: e2e-artifacts
+            path: /workspace/_artifacts
+            optional: true
+      volumes:
+        - name: workspace
+          emptyDir: {}
+        - name: dind-storage
+          emptyDir: {}
+
+      sidecars:
+        - name: dind
+          image: docker.io/library/docker:24-dind
+          args:
+            - "--host=tcp://0.0.0.0:2375"
+            - "--host=unix:///var/run/docker.sock"
+          env:
+            - name: DOCKER_TLS_CERTDIR
+              value: ""
+          readinessProbe:
+            tcpSocket:
+              port: 2375
+            initialDelaySeconds: 2
+            periodSeconds: 2
+          securityContext:
+            privileged: true
+          resources:
+            requests:
+              cpu: "4"
+              memory: "8Gi"
+            limits:
+              cpu: "8"
+              memory: "16Gi"
+          volumeMounts:
+            - name: dind-storage
+              mountPath: /var/lib/docker
+
+      container:
+        image: docker.io/library/golang:1.23-bookworm
+        workingDir: /workspace
+        securityContext:
+          runAsUser: 0
+        resources:
+          requests:
+            cpu: "4"
+            memory: "8Gi"
+          limits:
+            cpu: "8"
+            memory: "16Gi"
+        env:
+          # Docker socket via dind sidecar
+          - name: DOCKER_HOST
+            value: tcp://localhost:2375
+
+          # Registry for docker build/push from the Makefile
+          - name: REGISTRY
+            value: "{{workflow.parameters.registry}}"
+
+          # Required envs for scripts/ci-e2e.sh
+          - name: OCI_COMPARTMENT_ID
+            value: "{{workflow.parameters.oci_compartment_id}}"
+          - name: OCI_IMAGE_ID
+            value: "{{workflow.parameters.oci_image_id}}"
+          - name: OCI_ORACLE_LINUX_IMAGE_ID
+            value: "{{workflow.parameters.oci_oracle_linux_image_id}}"
+          - name: OCI_UPGRADE_IMAGE_ID
+            value: "{{workflow.parameters.oci_upgrade_image_id}}"
+          - name: OCI_MANAGED_NODE_IMAGE_ID
+            value: "{{workflow.parameters.oci_managed_node_image_id}}"
+          # Also set the variant used by e2e_conf.yaml envsubst.
+          - name: KUBERNETES_UPGRADE_OCI_IMAGE_ID
+            value: "{{workflow.parameters.oci_upgrade_image_id}}"
+          - name: OCI_ALTERNATIVE_REGION_IMAGE_ID
+            value: "{{workflow.parameters.oci_alternative_region_image_id}}"
+
+          # Optional envs used by templates/config
+          - name: OCI_WINDOWS_IMAGE_ID
+            value: "{{workflow.parameters.oci_windows_image_id}}"
+          - name: OCI_SSH_KEY
+            value: "{{workflow.parameters.oci_ssh_key}}"
+          - name: OCI_ALTERNATIVE_REGION
+            value: "{{workflow.parameters.oci_alternative_region}}"
+
+          # Feature gates and behavior flags
+          - name: CLUSTER_TOPOLOGY
+            value: "{{workflow.parameters.cluster_topology}}"
+          - name: EXP_MACHINE_POOL
+            value: "{{workflow.parameters.exp_machine_pool}}"
+          - name: EXP_OKE
+            value: "{{workflow.parameters.exp_oke}}"
+          - name: USE_INSTANCE_PRINCIPAL
+            value: "{{workflow.parameters.use_instance_principal}}"
+          - name: USE_INSTANCE_PRINCIPAL_B64
+            value: "{{workflow.parameters.use_instance_principal_b64}}"
+
+          # Test runner config
+          - name: GINKGO_NODES
+            value: "{{workflow.parameters.ginkgo_nodes}}"
+          - name: GINKGO_FOCUS_PARAM
+            value: "{{workflow.parameters.ginkgo_focus}}"
+          - name: KIND_NODE_IMAGE
+            value: "{{workflow.parameters.kind_node_image}}"
+          - name: KIND_IMAGE
+            value: "{{workflow.parameters.kind_node_image}}"
+          - name: KIND_CLUSTER_IMAGE
+            value: "{{workflow.parameters.kind_node_image}}"
+          - name: KIND_EXPERIMENTAL_IMAGE
+            value: "{{workflow.parameters.kind_node_image}}"
+          - name: TAG
+            value: "{{workflow.parameters.tag}}"
+          - name: NODE_MACHINE_COUNT
+            value: "{{workflow.parameters.node_machine_count}}"
+          - name: OCIR_USERNAME
+            value: "{{workflow.parameters.ocir_username}}"
+          - name: OCIR_REGION
+            value: "{{workflow.parameters.ocir_region}}"
+          - name: OCIR_REGION_SHORT_CODE
+            value: "{{workflow.parameters.ocir_region_short_code}}"  # fixed: previously wired to ocir_region (long form)
+          - name: KIND_CGROUP_DRIVER
+            value: "cgroupfs"
+
+        command: ["bash", "-ec"]
+        args:
+          - |
+            set -o errexit
+            set -o nounset
+            set -o pipefail
+
+            # Base tooling for the build and e2e scripts
+            apt-get update
+            DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+              git make curl ca-certificates openssh-client jq
+            update-ca-certificates
+
+            # Install docker CLI to talk to dind sidecar
+            # (use static binary to avoid extra package repos)
+            curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-24.0.9.tgz \
+              | tar -xz -C /usr/local/bin --strip-components=1 docker/docker
+            for i in {1..60}; do
+              if docker version >/dev/null 2>&1; then
+                break
+              fi
+              echo "waiting for docker daemon on ${DOCKER_HOST} ..."
+              sleep 2
+            done
+            docker version
+
+            # Ensure binaries installed by the Makefile are on PATH (kustomize, ginkgo, etc.)
+            export PATH="/workspace/bin:${PATH}"
+            # Also provide kustomize in a standard location for tools that exec 'kustomize' by name.
+            ln -sf /workspace/bin/kustomize /usr/local/bin/kustomize || true
+            ln -sf /workspace/bin/ginkgo /usr/local/bin/ginkgo || true
+            ln -sf /workspace/bin/kubectl /usr/local/bin/kubectl || true
+            command -v kustomize || true
+            kustomize version || true
+            command -v kubectl || true
+            kubectl version --client=true || true  # fixed: --short was removed in kubectl 1.28+
+
+            # Install OCI CLI for docker authentication to OCIR
+            DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3 python3-venv python3-pip unzip curl jq
+            export PATH="/root/.local/bin:${PATH}"
+            python3 -m venv /opt/oci-cli
+            /opt/oci-cli/bin/pip install --upgrade pip
+            /opt/oci-cli/bin/pip install "oci_cli==3.71.1"
+            ln -sf /opt/oci-cli/bin/oci /usr/local/bin/oci
+
+            oci --version || true
+
+            # Setup OCIR token and login
+            echo "Generating the ocir token"
+            echo "https://{{workflow.parameters.ocir_region}}.ocir.io/20180419/docker/token"
+            OCIR_TOKEN=$(oci raw-request \
+              --http-method GET \
+              --target-uri "https://{{workflow.parameters.ocir_region}}.ocir.io/20180419/docker/token" \
+              --auth instance_principal \
+              --region "{{workflow.parameters.ocir_region}}" \
+              --profile DEFAULT | jq -r .data.token)
+
+            echo "docker login"
+            echo "${OCIR_TOKEN}" | docker login -u BEARER_TOKEN --password-stdin "{{workflow.parameters.ocir_region_short_code}}.ocir.io"
+
+            # Clone repository and checkout ref
+            git clone --depth 1 --branch "{{workflow.parameters.git_ref}}" "{{workflow.parameters.git_repo}}" /workspace
+
+            # Optional Ginkgo focus override
+            if [ -n "${GINKGO_FOCUS_PARAM:-}" ]; then
+              export GINKGO_FOCUS="${GINKGO_FOCUS_PARAM}"
+            fi
+
+            # Preflight: show which Kind node image will be used
+            echo "KIND_IMAGE=${KIND_IMAGE:-unset} KIND_NODE_IMAGE=${KIND_NODE_IMAGE:-unset} KIND_CLUSTER_IMAGE=${KIND_CLUSTER_IMAGE:-unset} KIND_EXPERIMENTAL_IMAGE=${KIND_EXPERIMENTAL_IMAGE:-unset}"
+
+            # Start capoci-controller-manager log watcher in background (waits until 'kind' exists)
+            export CLUSTER_NAME="${CLUSTER_NAME:-capoci-e2e}"
+            export OUTPUT_DIR="/workspace/_artifacts"
+            ( until command -v kind >/dev/null 2>&1; do echo "waiting for kind binary..."; sleep 2; done; \
+              ./scripts/watch-capoci-controller.sh -n "${CLUSTER_NAME}" -o "${OUTPUT_DIR}" -r 180 -d 5 ) \
+              & echo $! > /tmp/capoci_watch_pid
+
+            # Run the e2e test harness
+            # This builds and runs the actual test. Everything else is package install and setup or reporting
+            ./scripts/ci-e2e.sh
+
+            # Stop watcher if running
+            if [ -f /tmp/capoci_watch_pid ]; then
+              kill "$(cat /tmp/capoci_watch_pid)" >/dev/null 2>&1 || true
+              rm -f /tmp/capoci_watch_pid
+            fi
+
+            # Build logs-only artifacts (logs + report.json), exclude YAML and other files
+            mkdir -p /workspace/_artifacts_logs_only
+            if [ -d /workspace/_artifacts ]; then
+              find /workspace/_artifacts -type f -name '*.log' -exec cp --parents {} /workspace/_artifacts_logs_only \; || true
+            fi
+            if [ -f /workspace/report.json ]; then
+              cp /workspace/report.json /workspace/_artifacts_logs_only/ || true
+            fi
+
+            # Emit the ginkgo JSON report into logs for easy retrieval
+            if [ -f /workspace/report.json ]; then
+              echo "=== BEGIN report.json ==="
+              cat /workspace/report.json || true
+              echo "=== END report.json ==="
+            fi
+
+            # Best-effort: surface CAPOCI controller logs from the kind mgmt cluster (if still present)
+            if command -v kubectl >/dev/null 2>&1; then
+              echo "=== capoci-controller-manager logs (cluster-api-provider-oci-system) ==="
+              LOG_DIR="/workspace/_artifacts/capoci-controller-manager"
+              mkdir -p "${LOG_DIR}"
+              # Save pod listing and logs to files while also emitting to stdout
+              kubectl --context kind-capoci-e2e -n cluster-api-provider-oci-system get pods -o wide | tee "${LOG_DIR}/pods.txt" || true
+              pods=$(kubectl --context kind-capoci-e2e -n cluster-api-provider-oci-system get pods -l app=capoci-controller-manager -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true)
+              for p in $pods; do
+                echo "--- logs for pod: $p"
+                kubectl --context kind-capoci-e2e -n cluster-api-provider-oci-system logs "$p" --all-containers=true --tail=-1 | tee "${LOG_DIR}/${p}.log" || true
+              done
+              if [ -z "${pods}" ]; then
+                # Fallback to Deployment name if label not present
+                kubectl --context kind-capoci-e2e -n cluster-api-provider-oci-system logs deploy/capoci-controller-manager --all-containers=true --tail=-1 | tee "${LOG_DIR}/deployment.log" || true
+              fi
+            fi
+
+            # Rebuild logs-only artifacts to include controller logs saved above
+            if [ -d /workspace/_artifacts ]; then
+              find /workspace/_artifacts -type f -name '*.log' -exec cp --parents {} /workspace/_artifacts_logs_only \; || true
+            fi
+
+        volumeMounts:
+          - name: workspace
+            mountPath: /workspace
diff --git a/hack/ensure-kind.sh b/hack/ensure-kind.sh
index 9f257785..1d19e5f5 100755
--- a/hack/ensure-kind.sh
+++ b/hack/ensure-kind.sh
@@ -6,7 +6,7 @@ set -o nounset
 set -o pipefail
 
 GOPATH_BIN="$(go env GOPATH)/bin/"
-MINIMUM_KIND_VERSION=v0.10.0
+MINIMUM_KIND_VERSION=v0.25.0
 
 # Ensure the kind tool exists and is a viable version, or installs it
 verify_kind_version() {
@@ -39,3 +39,5 @@ EOF
 }
 
 verify_kind_version
+# Print kind version for visibility in CI logs
+kind version || true
diff --git a/scripts/watch-capoci-controller.sh b/scripts/watch-capoci-controller.sh
new file mode 100755
index 00000000..b8a9ee63
--- /dev/null
+++ b/scripts/watch-capoci-controller.sh
@@ -0,0 +1,198 @@
+#!/usr/bin/env bash
+#
+# Watch for capoci-controller-manager pod and follow its logs.
+#
+# Features:
+#   1) Attempts to connect to a local kind cluster (kind export kubeconfig --name <name>) with retries.
+#   2) Waits for the capoci-controller-manager-xxxx pod in the namespace cluster-api-provider-oci-system to be Ready.
+#   3) Follows the pod logs and also writes them to a file in the specified output directory (defaults to this script's directory).
+#
+# Usage:
+#   scripts/watch-capoci-controller.sh [--name <cluster>] [--output-dir <dir>] [--retries N] [--delay SECONDS]
+#
+# Examples:
+#   scripts/watch-capoci-controller.sh
+#   scripts/watch-capoci-controller.sh -n capoci-e2e -o ./_artifacts -r 90 -d 5
+#
+# Requirements:
+#   - kind
+#   - kubectl
+
+set -Eeuo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+
+CLUSTER_NAME="${CLUSTER_NAME:-capoci-e2e}"
+OUTPUT_DIR="${OUTPUT_DIR:-$SCRIPT_DIR}"
+RETRIES="${RETRIES:-60}"   # total attempts for various waits
+DELAY_SECONDS="${DELAY_SECONDS:-5}"
+NAMESPACE="cluster-api-provider-oci-system"
+POD_PREFIX="capoci-controller-manager"
+
+usage() {
+  cat <<EOF
+Usage: ${0##*/} [options]
+
+Options:
+  -n, --name <name>        Kind cluster name (default: ${CLUSTER_NAME})
+  -o, --output-dir <dir>   Directory for log output (default: ${OUTPUT_DIR})
+  -r, --retries <n>        Number of retry attempts (default: ${RETRIES})
+  -d, --delay <seconds>    Delay between retries in seconds (default: ${DELAY_SECONDS})
+  -h, --help               Show this help and exit
+
+Environment variables also supported:
+  CLUSTER_NAME, OUTPUT_DIR, RETRIES, DELAY_SECONDS
+EOF
+}
+
+log() {
+  printf '[%(%Y-%m-%dT%H:%M:%S%z)T] %s\n' -1 "$*" >&2  # fixed: stderr, so $(wait_for_controller_pod_ready) captures only the pod name
+}
+
+err() {
+  printf '[%(%Y-%m-%dT%H:%M:%S%z)T] ERROR: %s\n' -1 "$*" >&2
+}
+
+require_bin() {
+  if ! command -v "$1" >/dev/null 2>&1; then
+    err "Required binary not found in PATH: $1"
+    exit 1
+  fi
+}
+
+parse_args() {
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -n|--name)
+        CLUSTER_NAME="$2"; shift 2 ;;
+      -o|--output-dir)
+        OUTPUT_DIR="$2"; shift 2 ;;
+      -r|--retries)
+        RETRIES="$2"; shift 2 ;;
+      -d|--delay)
+        DELAY_SECONDS="$2"; shift 2 ;;
+      -h|--help)
+        usage; exit 0 ;;
+      --)
+        shift; break ;;
+      -*)
+        err "Unknown option: $1"
+        usage
+        exit 2 ;;
+      *)
+        # ignore stray args for now
+        shift ;;
+    esac
+  done
+}
+
+ensure_kind_kubeconfig_and_apiserver() {
+  local context="kind-${CLUSTER_NAME}"
+  local attempt=1
+
+  while (( attempt <= RETRIES )); do
+    log "[$attempt/$RETRIES] Exporting kubeconfig for kind cluster '${CLUSTER_NAME}'..."
+    # Export kubeconfig; this merges/updates current kubeconfig
+    if ! kind export kubeconfig --name "${CLUSTER_NAME}" >/dev/null 2>&1; then
+      log "kind export kubeconfig not yet ready, retrying after ${DELAY_SECONDS}s..."
+      sleep "${DELAY_SECONDS}"
+      ((attempt++))
+      continue
+    fi
+
+    # Verify the context exists and API is reachable
+    if kubectl --context "${context}" get --raw=/healthz >/dev/null 2>&1 || \
+       kubectl --context "${context}" get ns >/dev/null 2>&1; then
+      log "Kubernetes API for context '${context}' is reachable."
+      # Set current context to ensure subsequent kubectl commands target this cluster
+      kubectl config use-context "${context}" >/dev/null
+      return 0
+    fi
+
+    log "API server for '${context}' not yet reachable, retrying after ${DELAY_SECONDS}s..."
+    sleep "${DELAY_SECONDS}"
+    ((attempt++))
+  done
+
+  err "Failed to connect to kind cluster '${CLUSTER_NAME}' after ${RETRIES} attempts."
+  exit 1
+}
+
+wait_for_namespace() {
+  local attempt=1
+  while (( attempt <= RETRIES )); do
+    if kubectl get ns "${NAMESPACE}" >/dev/null 2>&1; then
+      log "Namespace '${NAMESPACE}' is present."
+      return 0
+    fi
+    log "[$attempt/$RETRIES] Waiting for namespace '${NAMESPACE}' to exist... retrying in ${DELAY_SECONDS}s"
+    sleep "${DELAY_SECONDS}"
+    ((attempt++))
+  done
+
+  err "Namespace '${NAMESPACE}' did not appear after ${RETRIES} attempts."
+  exit 1
+}
+
+find_controller_pod_name() {
+  # returns first matching pod name or empty string
+  kubectl get pods -n "${NAMESPACE}" --no-headers 2>/dev/null \
+    | awk -v pfx="^${POD_PREFIX}-" '$1 ~ pfx {print $1; exit}'
+}
+
+wait_for_controller_pod_ready() {
+  local attempt=1
+  local pod=""
+
+  while (( attempt <= RETRIES )); do
+    pod="$(find_controller_pod_name || true)"
+    if [[ -n "${pod}" ]]; then
+      log "Found pod '${pod}', waiting for Ready condition..."
+      if kubectl -n "${NAMESPACE}" wait --for=condition=Ready "pod/${pod}" --timeout=60s >/dev/null 2>&1; then
+        log "Pod '${pod}' is Ready."
+        echo "${pod}"
+        return 0
+      fi
+      log "Pod '${pod}' not Ready yet, will retry..."
+    else
+      log "[$attempt/$RETRIES] Controller pod with prefix '${POD_PREFIX}-' not found yet..."
+    fi
+
+    sleep "${DELAY_SECONDS}"
+    ((attempt++))
+  done
+
+  err "Controller pod '${POD_PREFIX}-xxxx' did not become Ready after ${RETRIES} attempts."
+  exit 1
+}
+
+main() {
+  parse_args "$@"
+  require_bin kind
+  require_bin kubectl
+
+  mkdir -p "${OUTPUT_DIR}"
+
+  log "Using configuration: cluster='${CLUSTER_NAME}', namespace='${NAMESPACE}', output-dir='${OUTPUT_DIR}', retries=${RETRIES}, delay=${DELAY_SECONDS}s"
+
+  ensure_kind_kubeconfig_and_apiserver
+  wait_for_namespace
+
+  local pod_name
+  pod_name="$(wait_for_controller_pod_ready)"
+
+  local ts
+  ts="$(date +%Y%m%d-%H%M%S)"
+  local log_file="${OUTPUT_DIR}/${POD_PREFIX}-${CLUSTER_NAME}-${ts}.log"
+
+  log "Following logs for pod '${pod_name}' in namespace '${NAMESPACE}'. Logs will also be written to: ${log_file}"
+  log "Press Ctrl+C to stop streaming logs. The file will remain at: ${log_file}"
+
+  # Stream logs, mirror to file.
+  # Note: If the pod restarts or gets re-created, this will follow the current pod instance only.
+  # If you prefer following the deployment instead, replace with:
+  #   kubectl -n "${NAMESPACE}" logs -f deploy/${POD_PREFIX} | tee "${log_file}"
+  kubectl -n "${NAMESPACE}" logs -f "${pod_name}" | tee "${log_file}"
+}
+
+main "$@"