diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e43b0f9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/README.md b/README.md
index 20449e0..6827ded 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-## SageMaker Studio Custom Image Samples
+# SageMaker Studio Custom Image Samples
 
-### Overview
+## Overview
 
 This repository contains examples of Docker images that are valid custom images for KernelGateway Apps in SageMaker Studio. These custom images enable you to bring your own packages, files, and kernels for use with notebooks, terminals, and interactive consoles within SageMaker Studio.
 
@@ -13,11 +13,13 @@ This repository contains examples of Docker images that are valid custom images
 - [rapids-image](examples/rapids-image) - This example uses the offical rapids.ai image from Dockerhub. Use with a GPU instance on Studio
 - [scala-image](examples/scala-image) - This example adds a Scala kernel based on [Almond Scala Kernel](https://almond.sh/).
 - [tf2.3-image](examples/tf23-image) - This examples uses the official TensorFlow 2.3 image from DockerHub and demonstrates bundling custom files along with the image.
+- [spark-image](examples/spark-image) - This example provides a Spark image for interactive PySpark development in SageMaker Studio.
+
 
 #### One-time setup
 
 All examples have a one-time setup to create an ECR repository
-```
+```bash
 REGION=
 aws --region ${REGION} ecr create-repository \
     --repository-name smstudio-custom
@@ -29,4 +29,4 @@ See [DEVELOPMENT.md](DEVELOPMENT.md)
 
 ### License
 
-This sample code is licensed under the MIT-0 License. See the LICENSE file.
\ No newline at end of file
+This sample code is licensed under the MIT-0 License. See the LICENSE file.
diff --git a/examples/.gitignore b/examples/.gitignore
new file mode 100644
index 0000000..e43b0f9
--- /dev/null
+++ b/examples/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/examples/echo-kernel-image/Dockerfile.orig b/examples/echo-kernel-image/Dockerfile.orig
new file mode 100644
index 0000000..8274702
--- /dev/null
+++ b/examples/echo-kernel-image/Dockerfile.orig
@@ -0,0 +1,38 @@
+FROM python:3.6
+
+ARG NB_USER="sagemaker-user"
+ARG NB_UID="1000"
+ARG NB_GID="100"
+
+
+######################
+# OVERVIEW
+# 1. Creates the `sagemaker-user` user with UID/GID 1000/100.
+# 2. Ensures this user can `sudo` by default.
+# 3. Installs the echo kernel and its dependencies from PyPI.
+# 4. Makes the default shell `bash`. This enhances the experience inside a Jupyter terminal, as Jupyter otherwise defaults to `sh`.
+######################
+
+# Setup the "sagemaker-user" user with root privileges.
+RUN \
+    apt-get update && \
+    apt-get install -y sudo && \
+    useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \
+    chmod g+w /etc/passwd && \
+    echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \
+    # Prevent apt-get cache from being persisted to this layer.
+    rm -rf /var/lib/apt/lists/*
+
+# Install and configure the kernel.
+RUN \
+    pip install echo_kernel \
+    # These are dependencies of echo_kernel, but the version on PyPI is old and doesn't declare them correctly.
+    jupyter_client IPython ipykernel && \
+    # This ensures that the kernelspec.json is installed in the location expected by Jupyter/KernelGateway.
+    python -m echo_kernel.install --sys-prefix
+
+# Make the default shell bash (vs "sh") for a better Jupyter terminal UX
+ENV SHELL=/bin/bash
+
+USER $NB_UID
+
diff --git a/examples/spark-image/Dockerfile b/examples/spark-image/Dockerfile
new file mode 100644
index 0000000..22fc7b1
--- /dev/null
+++ b/examples/spark-image/Dockerfile
@@ -0,0 +1,165 @@
+FROM ubuntu:18.04
+
+ARG NB_USER="sagemaker-user"
+ARG NB_UID="1000"
+ARG NB_GID="100"
+
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Setup the "sagemaker-user" user with root privileges.
+RUN apt-get update && \
+    apt-get install -y sudo && \
+    useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \
+    chmod g+w /etc/passwd && \
+    echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \
+    # Prevent apt-get cache from being persisted to this layer.
+    apt-get clean && rm -rf /var/lib/apt/lists/* && \
+    echo "en_US.UTF-8 UTF-8" > /etc/locale.gen
+
+USER $NB_UID
+
+# Make the default shell bash (vs "sh") for a better Jupyter terminal UX
+ENV SHELL=/bin/bash \
+    NB_USER=$NB_USER \
+    NB_UID=$NB_UID \
+    NB_GID=$NB_GID \
+    LC_ALL=en_US.UTF-8 \
+    LANG=en_US.UTF-8 \
+    LANGUAGE=en_US.UTF-8 \
+    HOME=/home/$NB_USER \
+    MINICONDA_VERSION=4.6.14 \
+    CONDA_VERSION=4.6.14 \
+    MINICONDA_MD5=718259965f234088d785cad1fbd7de03 \
+    CONDA_DIR=/opt/conda \
+    PATH=$CONDA_DIR/bin:${PATH}
+
+
+USER root
+RUN apt-get update --yes
+# COPY ./apt-packages.txt /root/apt-packages.txt
+# RUN xargs -a /root/apt-packages.txt apt-get install -y --no-install-recommends
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    wget \
+    curl \
+    ca-certificates \
+    sudo \
+    locales \
+    fonts-liberation \
+    run-one && \
+    apt-get clean && rm -rf /var/lib/apt/lists/* && \
+    echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \
+    locale-gen
+
+RUN mkdir -p $CONDA_DIR && \
+    chown -R $NB_USER:$NB_GID $CONDA_DIR && \
+    # Fix for devtools https://github.com/conda-forge/r-devtools-feedstock/issues/4
+    ln -s /bin/tar /bin/gtar
+
+# Copy a script that we will use to correct permissions after running certain commands
+COPY fix-permissions /usr/local/bin/fix-permissions
+RUN chmod a+rx /usr/local/bin/fix-permissions
+COPY fix-host-settings /usr/local/bin/fix-host-settings
+RUN chmod a+rx /usr/local/bin/fix-host-settings
+
+RUN echo "auth requisite pam_deny.so" >> /etc/pam.d/su && \
+    sed -i.bak -e 's/^%admin/#%admin/' /etc/sudoers && \
+    sed -i.bak -e 's/^%sudo/#%sudo/' /etc/sudoers && \
+    usermod -G root ${NB_USER} && \
+    mkdir -p "${CONDA_DIR}" && \
+    chown "${NB_USER}:${NB_GID}" "${CONDA_DIR}" && \
+    chmod g+w /etc/passwd && \
+    fix-permissions "${HOME}" && \
+    fix-permissions "${CONDA_DIR}"
+
+USER ${NB_UID}
+ARG PYTHON_VERSION=3.6.14
+ENV PATH=$CONDA_DIR/bin:${PATH}
+WORKDIR /tmp
+
+# Install conda via Miniconda
+RUN curl --silent --show-error --output miniconda-installer.sh https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh && \
+    echo "${MINICONDA_MD5} *miniconda-installer.sh" | md5sum -c - && \
+    /bin/bash miniconda-installer.sh -f -b -p $CONDA_DIR && \
+    rm miniconda-installer.sh && \
+    conda config --system --prepend channels conda-forge && \
+    conda config --system --set auto_update_conda false && \
+    conda config --system --set show_channel_urls true && \
+    conda config --system --set pip_interop_enabled true && \
+    conda install --quiet --yes conda="${CONDA_VERSION%.*}.*" && \
+    conda update --all --quiet --yes && \
+    conda clean --all -f -y && \
+    rm -rf /home/$NB_USER/.cache/yarn
+
+RUN conda install --quiet --yes \
+    tini \
+    boto3 \
+    'awscli>=1.18' \
+    sagemaker_pyspark \
+    'pyspark==2.4.0' \
+    'notebook=6.4.0' \
+    'jupyterhub=1.4.1' \
+    'jupyterlab=3.0.16' && \
+    conda clean --all -f -y && \
+    npm cache clean --force && \
+    jupyter notebook --generate-config && \
+    jupyter lab clean && \
+    rm -rf "/home/${NB_USER}/.cache/yarn" && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+EXPOSE 8888
+
+# Copy local files as late as possible to avoid cache busting
+COPY start.sh start-notebook.sh start-singleuser.sh /usr/local/bin/
+# Currently need to have both jupyter_notebook_config and jupyter_server_config to support classic and lab
+COPY jupyter_notebook_config.py /etc/jupyter/
+
+# Fix permissions on /etc/jupyter as root
+USER root
+
+# Prepare upgrade to JupyterLab V3.0 #1205
+RUN sed -re "s/c.NotebookApp/c.ServerApp/g" \
+    /etc/jupyter/jupyter_notebook_config.py > /etc/jupyter/jupyter_server_config.py && \
+    fix-permissions /etc/jupyter/
+
+# Spark dependencies
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+ARG spark_version="2.4.0"
+ARG hadoop_version="2.7"
+ARG spark_checksum="5F4184E0FE7E5C8AE67F5E6BC5DEEE881051CC712E9FF8AEDDF3529724C00E402C94BB75561DD9517A372F06C1FCB78DC7AE65DCBD4C156B3BA4D8E267EC2936"
+ARG openjdk_version="8"
+
+ENV APACHE_SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}"
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Spark installation
+WORKDIR /tmp
+RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
+    echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
+    tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
+    rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
+
+WORKDIR /usr/local
+
+# Configure Spark
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+    PATH="${PATH}:${SPARK_HOME}/bin"
+
+RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark && \
+    # Add a link in the before_notebook hook so that PYTHONPATH is sourced automatically
+    mkdir -p /usr/local/bin/before-notebook.d && \
+    ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
+
+USER ${NB_UID}
+
+WORKDIR $HOME
diff --git a/examples/spark-image/README.md b/examples/spark-image/README.md
new file mode 100644
index 0000000..11ba33b
--- /dev/null
+++ b/examples/spark-image/README.md
@@ -0,0 +1,67 @@
+# Spark Image
+
+## Overview
+
+This image packages a Spark kernel as a custom image for SageMaker Studio, enabling interactive Spark development in Python, including reading and writing data in Amazon S3 buckets.
+
+The image is based on Spark 2.4.0, Hadoop 2.7, and OpenJDK 8. It also includes the latest version of sagemaker_pyspark (1.4.2), which provides the aws-hadoop JAR and its other dependencies for working with AWS services such as Amazon Simple Storage Service (S3).
+
+The example notebook (pyspark-kernel-file-read) shows how to use different credential providers when making calls to S3.
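+
+For readers skimming this README, the gist of that notebook is a Spark session configured with the SageMaker Spark JARs and an explicit S3 credentials provider. Below is a minimal sketch using the anonymous provider against a public bucket; the notebook itself also covers the long- and short-term credential providers.
+
+```python
+from pyspark.sql import SparkSession
+import sagemaker_pyspark
+
+# Put the SageMaker Spark JARs (aws-hadoop and friends) on the driver classpath.
+classpath = ":".join(sagemaker_pyspark.classpath_jars())
+spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath).getOrCreate()
+
+# Anonymous credentials are sufficient for public, world-readable buckets.
+spark.sparkContext._jsc.hadoopConfiguration().set(
+    "fs.s3a.aws.credentials.provider",
+    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
+
+df = spark.read.json("s3a://awsglue-datasets/examples/us-legislators/all/organizations.json")
+df.show()
+```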
+
+### Building the image
+
+Build the Docker image and push it to Amazon ECR.
+
+```bash
+# Modify these as required. The Docker registry endpoint can be tuned based on your current region from https://docs.aws.amazon.com/general/latest/gr/ecr.html#ecr-docker-endpoints
+REGION=
+ACCOUNT_ID=
+
+
+# Build the image
+IMAGE_NAME=spark-kernel
+aws --region ${REGION} ecr get-login-password | docker login --username AWS --password-stdin ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom
+docker build . -t ${IMAGE_NAME} -t ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}
+```
+
+```bash
+docker push ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}
+```
+
+### Using with SageMaker Studio
+
+Create a SageMaker Image with the image in ECR.
+
+```bash
+# Role in your account to be used for the SageMaker Image
+ROLE_ARN=
+
+aws --region ${REGION} sagemaker create-image \
+    --image-name ${IMAGE_NAME} \
+    --role-arn ${ROLE_ARN}
+
+aws --region ${REGION} sagemaker create-image-version \
+    --image-name ${IMAGE_NAME} \
+    --base-image "${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}"
+
+# Verify that the image-version was created successfully. Do NOT proceed if the image-version is in the CREATE_FAILED state or in any state other than CREATED.
+aws --region ${REGION} sagemaker describe-image-version --image-name ${IMAGE_NAME}
+```
+
+Create an AppImageConfig for this image.
+
+```bash
+aws --region ${REGION} sagemaker create-app-image-config --cli-input-json file://app-image-config-input.json
+```
+
+Create a Domain, providing the SageMaker Image and AppImageConfig in the Domain creation. Replace the placeholders for VPC ID, Subnet IDs, and Execution Role in `create-domain-input.json`.
+
+```bash
+aws --region ${REGION} sagemaker create-domain --cli-input-json file://create-domain-input.json
+```
+
+If you have an existing Domain, you can instead use the `update-domain` command.
+
+```bash
+aws --region ${REGION} sagemaker update-domain --cli-input-json file://update-domain-input.json
+```
diff --git a/examples/spark-image/app-image-config-input.json b/examples/spark-image/app-image-config-input.json
new file mode 100644
index 0000000..58851f2
--- /dev/null
+++ b/examples/spark-image/app-image-config-input.json
@@ -0,0 +1,16 @@
+{
+  "AppImageConfigName": "custom-spark-image-config",
+  "KernelGatewayImageConfig": {
+    "KernelSpecs": [
+      {
+        "Name": "python3",
+        "DisplayName": "spark"
+      }
+    ],
+    "FileSystemConfig": {
+      "MountPath": "/home/sagemaker-user",
+      "DefaultUid": 1000,
+      "DefaultGid": 100
+    }
+  }
+}
\ No newline at end of file
diff --git a/examples/spark-image/create-domain-input.json b/examples/spark-image/create-domain-input.json
new file mode 100644
index 0000000..d3382ee
--- /dev/null
+++ b/examples/spark-image/create-domain-input.json
@@ -0,0 +1,19 @@
+{
+  "DomainName": "domain-with-spark-kernel-image",
+  "VpcId": "",
+  "SubnetIds": [
+    ""
+  ],
+  "DefaultUserSettings": {
+    "ExecutionRole": "",
+    "KernelGatewayAppSettings": {
+      "CustomImages": [
+        {
+          "ImageName": "spark-kernel",
+          "AppImageConfigName": "custom-spark-image-config"
+        }
+      ]
+    }
+  },
+  "AuthMode": "IAM"
+}
\ No newline at end of file
diff --git a/examples/spark-image/fix-host-settings b/examples/spark-image/fix-host-settings
new file mode 100644
index 0000000..3f7a128
--- /dev/null
+++ b/examples/spark-image/fix-host-settings
@@ -0,0 +1,4 @@
+#!/bin/sh
+# Ensure the container hostname resolves locally; the Spark driver cannot start without a resolvable hostname.
+sudo -i chmod 777 /etc/hosts > /dev/null 2>&1
+if ! grep -qi "$HOSTNAME" /etc/hosts; then echo "127.0.0.1 ${HOSTNAME}" >> /etc/hosts; fi
+cat /etc/hosts
\ No newline at end of file
diff --git a/examples/spark-image/fix-permissions b/examples/spark-image/fix-permissions
new file mode 100644
index 0000000..0969275
--- /dev/null
+++ b/examples/spark-image/fix-permissions
@@ -0,0 +1,35 @@
+#!/bin/bash
+# set permissions on a directory
+# after any installation, if a directory needs to be (human) user-writable,
+# run this script on it.
+# It will make everything in the directory owned by the group ${NB_GID}
+# and writable by that group.
+# Deployments that want to set a specific user id can preserve permissions
+# by adding the `--group-add users` line to `docker run`.
+
+# uses find to avoid touching files that already have the right permissions,
+# which would cause massive image explosion
+
+# right permissions are:
+# group=${NB_GID}
+# AND permissions include group rwX (directory-execute)
+# AND directories have setuid,setgid bits set
+
+set -e
+
+for d in "$@"; do
+  find "${d}" \
+    ! \( \
+      -group "${NB_GID}" \
+      -a -perm -g+rwX \
+    \) \
+    -exec chgrp "${NB_GID}" {} \; \
+    -exec chmod g+rwX {} \;
+  # setuid, setgid *on directories only*
+  find "${d}" \
+    \( \
+      -type d \
+      -a ! -perm -6000 \
+    \) \
+    -exec chmod +6000 {} \;
+done
\ No newline at end of file
diff --git a/examples/spark-image/jupyter_notebook_config.py b/examples/spark-image/jupyter_notebook_config.py
new file mode 100644
index 0000000..ba2c78c
--- /dev/null
+++ b/examples/spark-image/jupyter_notebook_config.py
@@ -0,0 +1,64 @@
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+from jupyter_core.paths import jupyter_data_dir
+import subprocess
+import os
+import errno
+import stat
+
+c = get_config()  # noqa: F821
+c.NotebookApp.ip = "0.0.0.0"
+c.NotebookApp.port = 8888
+c.NotebookApp.open_browser = False
+
+# https://github.com/jupyter/notebook/issues/3130
+c.FileContentsManager.delete_to_trash = False
+
+# Generate a self-signed certificate
+if "GEN_CERT" in os.environ:
+    dir_name = jupyter_data_dir()
+    pem_file = os.path.join(dir_name, "notebook.pem")
+    try:
+        os.makedirs(dir_name)
+    except OSError as exc:  # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(dir_name):
+            pass
+        else:
+            raise
+
+    # Generate an openssl.cnf file to set the distinguished name
+    cnf_file = os.path.join(os.getenv("CONDA_DIR", "/usr/lib"), "ssl", "openssl.cnf")
+    if not os.path.isfile(cnf_file):
+        with open(cnf_file, "w") as fh:
+            fh.write(
+                """\
+[req]
+distinguished_name = req_distinguished_name
+[req_distinguished_name]
+"""
+            )
+
+    # Generate a certificate if one doesn't exist on disk
+    subprocess.check_call(
+        [
+            "openssl",
+            "req",
+            "-new",
+            "-newkey=rsa:2048",
+            "-days=365",
+            "-nodes",
+            "-x509",
+            "-subj=/C=XX/ST=XX/L=XX/O=generated/CN=generated",
+            f"-keyout={pem_file}",
+            f"-out={pem_file}",
+        ]
+    )
+    # Restrict access to the file
+    os.chmod(pem_file, stat.S_IRUSR | stat.S_IWUSR)
+    c.NotebookApp.certfile = pem_file
+
+# Change default umask for all subprocesses of the notebook server if set in
+# the environment
+if "NB_UMASK" in os.environ:
+    os.umask(int(os.environ["NB_UMASK"], 8))
\ No newline at end of file
diff --git a/examples/spark-image/pyspark-kernel-file-read.ipynb b/examples/spark-image/pyspark-kernel-file-read.ipynb
new file mode 100644
index 0000000..b159107
--- /dev/null
+++ b/examples/spark-image/pyspark-kernel-file-read.ipynb
@@ -0,0 +1,70 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
"!/usr/local/bin/fix-host-settings " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "import sagemaker_pyspark\n", + "import boto3\n", + "\n", + "# Configure Spark to use the SageMaker Spark dependency jars\n", + "classpath = \":\".join(sagemaker_pyspark.classpath_jars())\n", + "spark = SparkSession.builder.config(\"spark.driver.extraClassPath\", classpath).getOrCreate()\n", + "\n", + "\n", + "# This option is to read obecjts using long term credentials\n", + "# spark._jsc.hadoopConfiguration().set(\"fs.s3a.aws.credentials.provider\",\"org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider\")\n", + "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.access.key', 'accesskey')\n", + "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.secret.key', 'sceretkey')\n", + "\n", + "# This option is to read obecjts using short term credentials\n", + "# spark._jsc.hadoopConfiguration().set(\"fs.s3a.aws.credentials.provider\",\"org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider\")\n", + "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.access.key', 'accesskey')\n", + "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.secret.key', 'sceretkey')\n", + "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.session.token', 'sessiontoken')\n", + "\n", + "\n", + "# This option is to read public readable objects/buckets\n", + "spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider')\n", + "\n", + "df = spark.read.json(\"s3a://awsglue-datasets/examples/us-legislators/all/organizations.json\")\n", + "df.printSchema()\n", + "df.show()" + ] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "ask (ask/1)", + "language": "python", + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:694846537485:image-version/ask/1" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/spark-image/start-notebook.sh b/examples/spark-image/start-notebook.sh new file mode 100644 index 0000000..5d45e73 --- /dev/null +++ b/examples/spark-image/start-notebook.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +set -e + +wrapper="" +if [[ "${RESTARTABLE}" == "yes" ]]; then + wrapper="run-one-constantly" +fi + +if [[ -n "${JUPYTERHUB_API_TOKEN}" ]]; then + # launched by JupyterHub, use single-user entrypoint + exec /usr/local/bin/start-singleuser.sh "$@" +elif [[ -n "${JUPYTER_ENABLE_LAB}" ]]; then + # shellcheck disable=SC1091 + . /usr/local/bin/start.sh ${wrapper} jupyter lab "$@" +else + echo "WARN: Jupyter Notebook deprecation notice https://github.com/jupyter/docker-stacks#jupyter-notebook-deprecation-notice." + # shellcheck disable=SC1091 + . 
diff --git a/examples/spark-image/start-singleuser.sh b/examples/spark-image/start-singleuser.sh
new file mode 100644
index 0000000..ac92359
--- /dev/null
+++ b/examples/spark-image/start-singleuser.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+set -e
+
+# set default ip to 0.0.0.0
+if [[ "${NOTEBOOK_ARGS} $*" != *"--ip="* ]]; then
+    NOTEBOOK_ARGS="--ip=0.0.0.0 ${NOTEBOOK_ARGS}"
+fi
+
+# handle some deprecated environment variables
+# from DockerSpawner < 0.8.
+# These won't be passed from DockerSpawner 0.9,
+# so avoid specifying --arg=empty-string
+if [ -n "${NOTEBOOK_DIR}" ]; then
+    # shellcheck disable=SC2089
+    NOTEBOOK_ARGS="--notebook-dir='${NOTEBOOK_DIR}' ${NOTEBOOK_ARGS}"
+fi
+if [ -n "${JPY_PORT}" ]; then
+    NOTEBOOK_ARGS="--port=${JPY_PORT} ${NOTEBOOK_ARGS}"
+fi
+if [ -n "${JPY_USER}" ]; then
+    NOTEBOOK_ARGS="--user=${JPY_USER} ${NOTEBOOK_ARGS}"
+fi
+if [ -n "${JPY_COOKIE_NAME}" ]; then
+    NOTEBOOK_ARGS="--cookie-name=${JPY_COOKIE_NAME} ${NOTEBOOK_ARGS}"
+fi
+if [ -n "${JPY_BASE_URL}" ]; then
+    NOTEBOOK_ARGS="--base-url=${JPY_BASE_URL} ${NOTEBOOK_ARGS}"
+fi
+if [ -n "${JPY_HUB_PREFIX}" ]; then
+    NOTEBOOK_ARGS="--hub-prefix=${JPY_HUB_PREFIX} ${NOTEBOOK_ARGS}"
+fi
+if [ -n "${JPY_HUB_API_URL}" ]; then
+    NOTEBOOK_ARGS="--hub-api-url=${JPY_HUB_API_URL} ${NOTEBOOK_ARGS}"
+fi
+NOTEBOOK_BIN="jupyterhub-singleuser"
+
+# shellcheck disable=SC1091,SC2086,SC2090
+. /usr/local/bin/start.sh "${NOTEBOOK_BIN}" ${NOTEBOOK_ARGS} "$@"
\ No newline at end of file
diff --git a/examples/spark-image/start.sh b/examples/spark-image/start.sh
new file mode 100644
index 0000000..c46005c
--- /dev/null
+++ b/examples/spark-image/start.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+set -e
+
+# Exec the specified command or fall back on bash
+if [ $# -eq 0 ]; then
+    cmd=( "bash" )
+else
+    cmd=( "$@" )
+fi
+
+run-hooks () {
+    # Source scripts or run executable files in a directory
+    if [[ ! -d "${1}" ]] ; then
+        return
+    fi
+    echo "${0}: running hooks in ${1}"
+    for f in "${1}/"*; do
+        case "${f}" in
+            *.sh)
+                echo "${0}: running ${f}"
+                # shellcheck disable=SC1090
+                source "${f}"
+                ;;
+            *)
+                if [[ -x "${f}" ]] ; then
+                    echo "${0}: running ${f}"
+                    "${f}"
+                else
+                    echo "${0}: ignoring ${f}"
+                fi
+                ;;
+        esac
+    done
+    echo "${0}: done running hooks in ${1}"
+}
+
+run-hooks /usr/local/bin/start-notebook.d
+
+# Handle special flags if we're root
+if [ "$(id -u)" == 0 ] ; then
+
+    # Only attempt to change the jovyan username if it exists
+    if id jovyan &> /dev/null ; then
+        echo "Set username to: ${NB_USER}"
+        usermod -d "/home/${NB_USER}" -l "${NB_USER}" jovyan
+    fi
+
+    # handle home and working directory if the username changed
+    if [[ "${NB_USER}" != "jovyan" ]]; then
+        # changing username, make sure homedir exists
+        # (it could be mounted, and we shouldn't create it if it already exists)
-e "/home/${NB_USER}" ]]; then + echo "Relocating home dir to /home/${NB_USER}" + mv /home/jovyan "/home/${NB_USER}" || ln -s /home/jovyan "/home/${NB_USER}" + fi + # if workdir is in /home/jovyan, cd to /home/${NB_USER} + if [[ "${PWD}/" == "/home/jovyan/"* ]]; then + newcwd="/home/${NB_USER}/${PWD:13}" + echo "Setting CWD to ${newcwd}" + cd "${newcwd}" + fi + fi + + # Handle case where provisioned storage does not have the correct permissions by default + # Ex: default NFS/EFS (no auto-uid/gid) + if [[ "${CHOWN_HOME}" == "1" || "${CHOWN_HOME}" == 'yes' ]]; then + echo "Changing ownership of /home/${NB_USER} to ${NB_UID}:${NB_GID} with options '${CHOWN_HOME_OPTS}'" + # shellcheck disable=SC2086 + chown ${CHOWN_HOME_OPTS} "${NB_UID}:${NB_GID}" "/home/${NB_USER}" + fi + if [ -n "${CHOWN_EXTRA}" ]; then + for extra_dir in $(echo "${CHOWN_EXTRA}" | tr ',' ' '); do + echo "Changing ownership of ${extra_dir} to ${NB_UID}:${NB_GID} with options '${CHOWN_EXTRA_OPTS}'" + # shellcheck disable=SC2086 + chown ${CHOWN_EXTRA_OPTS} "${NB_UID}:${NB_GID}" "${extra_dir}" + done + fi + + # Change UID:GID of NB_USER to NB_UID:NB_GID if it does not match + if [ "${NB_UID}" != "$(id -u "${NB_USER}")" ] || [ "${NB_GID}" != "$(id -g "${NB_USER}")" ]; then + echo "Set user ${NB_USER} UID:GID to: ${NB_UID}:${NB_GID}" + if [ "${NB_GID}" != "$(id -g "${NB_USER}")" ]; then + groupadd -f -g "${NB_GID}" -o "${NB_GROUP:-${NB_USER}}" + fi + userdel "${NB_USER}" + useradd --home "/home/${NB_USER}" -u "${NB_UID}" -g "${NB_GID}" -G 100 -l "${NB_USER}" + fi + + # Enable sudo if requested + if [[ "${GRANT_SUDO}" == "1" || "${GRANT_SUDO}" == 'yes' ]]; then + echo "Granting ${NB_USER} sudo access and appending ${CONDA_DIR}/bin to sudo PATH" + echo "${NB_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/notebook + fi + + # Add ${CONDA_DIR}/bin to sudo secure_path + sed -r "s#Defaults\s+secure_path\s*=\s*\"?([^\"]+)\"?#Defaults secure_path=\"\1:${CONDA_DIR}/bin\"#" /etc/sudoers | grep secure_path > /etc/sudoers.d/path + + # Exec the command as NB_USER with the PATH and the rest of + # the environment preserved + run-hooks /usr/local/bin/before-notebook.d + echo "Executing the command:" "${cmd[@]}" + exec sudo -E -H -u "${NB_USER}" PATH="${PATH}" XDG_CACHE_HOME="/home/${NB_USER}/.cache" PYTHONPATH="${PYTHONPATH:-}" "${cmd[@]}" +else + if [[ "${NB_UID}" == "$(id -u jovyan 2>/dev/null)" && "${NB_GID}" == "$(id -g jovyan 2>/dev/null)" ]]; then + # User is not attempting to override user/group via environment + # variables, but they could still have overridden the uid/gid that + # container runs as. Check that the user has an entry in the passwd + # file and if not add an entry. + STATUS=0 && whoami &> /dev/null || STATUS=$? && true + if [[ "${STATUS}" != "0" ]]; then + if [[ -w /etc/passwd ]]; then + echo "Adding passwd file entry for $(id -u)" + sed -e "s/^jovyan:/nayvoj:/" /etc/passwd > /tmp/passwd + echo "jovyan:x:$(id -u):$(id -g):,,,:/home/jovyan:/bin/bash" >> /tmp/passwd + cat /tmp/passwd > /etc/passwd + rm /tmp/passwd + else + echo 'Container must be run with group "root" to update passwd file' + fi + fi + + # Warn if the user isn't going to be able to write files to ${HOME}. + if [[ ! -w /home/jovyan ]]; then + echo 'Container must be run with group "users" to update files' + fi + else + # Warn if looks like user want to override uid/gid but hasn't + # run the container as root. 
+        if [[ -n "${NB_UID}" && "${NB_UID}" != "$(id -u)" ]]; then
+            echo "Container must be run as root to set NB_UID to ${NB_UID}"
+        fi
+        if [[ -n "${NB_GID}" && "${NB_GID}" != "$(id -g)" ]]; then
+            echo "Container must be run as root to set NB_GID to ${NB_GID}"
+        fi
+    fi
+
+    # Warn if it looks like the user wants sudo mode but hasn't run
+    # the container as root.
+    if [[ "${GRANT_SUDO}" == "1" || "${GRANT_SUDO}" == 'yes' ]]; then
+        echo 'Container must be run as root to grant sudo permissions'
+    fi
+
+    # Execute the command
+    run-hooks /usr/local/bin/before-notebook.d
+    echo "Executing the command:" "${cmd[@]}"
+    exec "${cmd[@]}"
+fi
\ No newline at end of file
diff --git a/examples/spark-image/update-domain-input.json b/examples/spark-image/update-domain-input.json
new file mode 100644
index 0000000..b0e504e
--- /dev/null
+++ b/examples/spark-image/update-domain-input.json
@@ -0,0 +1,13 @@
+{
+  "DomainId": "",
+  "DefaultUserSettings": {
+    "KernelGatewayAppSettings": {
+      "CustomImages": [
+        {
+          "ImageName": "spark-kernel",
+          "AppImageConfigName": "custom-spark-image-config"
+        }
+      ]
+    }
+  }
+}