From a3e5de87e2befbdf8685a107239ba8ee8666c8f3 Mon Sep 17 00:00:00 2001 From: ajeettewari Date: Sat, 17 Jul 2021 20:37:33 +0200 Subject: [PATCH 1/6] added spark image --- .DS_Store | Bin 0 -> 6148 bytes README.md | 10 +- examples/.DS_Store | Bin 0 -> 6148 bytes examples/echo-kernel-image/Dockerfile.orig | 38 ++++ examples/spark-image/.DS_Store | Bin 0 -> 6148 bytes examples/spark-image/Dockerfile | 165 ++++++++++++++++++ examples/spark-image/README.md | 67 +++++++ .../spark-image/app-image-config-input.json | 16 ++ examples/spark-image/create-domain-input.json | 19 ++ examples/spark-image/fix-host-settings | 4 + examples/spark-image/fix-permissions | 35 ++++ .../spark-image/jupyter_notebook_config.py | 64 +++++++ .../pyspark-kernel-file-read.ipynb | 70 ++++++++ examples/spark-image/start-notebook.sh | 22 +++ examples/spark-image/start-singleuser.sh | 41 +++++ examples/spark-image/start.sh | 150 ++++++++++++++++ examples/spark-image/update-domain-input.json | 13 ++ 17 files changed, 710 insertions(+), 4 deletions(-) create mode 100644 .DS_Store create mode 100644 examples/.DS_Store create mode 100644 examples/echo-kernel-image/Dockerfile.orig create mode 100644 examples/spark-image/.DS_Store create mode 100644 examples/spark-image/Dockerfile create mode 100644 examples/spark-image/README.md create mode 100644 examples/spark-image/app-image-config-input.json create mode 100644 examples/spark-image/create-domain-input.json create mode 100644 examples/spark-image/fix-host-settings create mode 100644 examples/spark-image/fix-permissions create mode 100644 examples/spark-image/jupyter_notebook_config.py create mode 100644 examples/spark-image/pyspark-kernel-file-read.ipynb create mode 100644 examples/spark-image/start-notebook.sh create mode 100644 examples/spark-image/start-singleuser.sh create mode 100644 examples/spark-image/start.sh create mode 100644 examples/spark-image/update-domain-input.json diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d98352b4e543a45fbdb006a4b075e3f3a8a549c0 GIT binary patch literal 6148 zcmeHKJ8l9o5Sel$5oG9~S-p|Q&mPaS zmDrVEV aws --region ${REGION} ecr create-repository \ --repository-name smstudio-custom @@ -29,4 +31,4 @@ See [DEVELOPMENT.md](DEVELOPMENT.md) ### License -This sample code is licensed under the MIT-0 License. See the LICENSE file. \ No newline at end of file +This sample code is licensed under the MIT-0 License. See the LICENSE file. diff --git a/examples/.DS_Store b/examples/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9a273583ce0f191775b0ce73d66e9efc0f25beae GIT binary patch literal 6148 zcmeHKJ5EC}5S)b+kSwmi=$=s(>y)sLKDqO>#^56 zwmik#w*YK)+};680CTz{zI~XQ@4HXzsv<_D^Ne@g;td~o#Ooyc_keR3IN+0TIIp#hzM*^058aLPpm2zCOpJ2O gg_q--NXoqCbME)TAu;HT2c4*&0oO$)1^!!s6Z8rda{vGU literal 0 HcmV?d00001 diff --git a/examples/echo-kernel-image/Dockerfile.orig b/examples/echo-kernel-image/Dockerfile.orig new file mode 100644 index 0000000..8274702 --- /dev/null +++ b/examples/echo-kernel-image/Dockerfile.orig @@ -0,0 +1,38 @@ +FROM python:3.6 + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + + +###################### +# OVERVIEW +# 1. Creates the `sagemaker-user` user with UID/GID 1000/100. +# 2. Ensures this user can `sudo` by default. +# 3. Install the echo kernel from PyPI and install its dependencies. +# 4. Make the default shell `bash`. 
This enhances the experience inside a Jupyter terminal as otherwise Jupyter defaults to `sh` +###################### + +# Setup the "sagemaker-user" user with root privileges. +RUN \ + apt-get update && \ + apt-get install -y sudo && \ + useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \ + chmod g+w /etc/passwd && \ + echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + # Prevent apt-get cache from being persisted to this layer. + rm -rf /var/lib/apt/lists/* + +# Install and configure the kernel. +RUN \ + pip install echo_kernel \ + # These are dependencies of echo_kernel but the version on PyPI is old and doesn't declare them correctly. + jupyter_client IPython ipykernel && \ + # This ensures that the kernelspec.json is installed in location expected by Jupyter/KernelGateway. + python -m echo_kernel.install --sys-prefix + +# Make the default shell bash (vs "sh") for a better Jupyter terminal UX +ENV SHELL=/bin/bash + +USER $NB_UID + diff --git a/examples/spark-image/.DS_Store b/examples/spark-image/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..c4c641ba8675cf43a016c7e2fe00567143e7b961 GIT binary patch literal 6148 zcmeHKyG{c^3>-s>NHl4XDE9{_@&~ItiWF4$0|bIn5Oh%<{Z;ucK8^9CIUO{SXd+v( zXV>f5(@k+c1CZHyGX*99#&ktoGz?A8)dzML86nbn#~y1O@QG*42i3nPl>3MVFXS{m z{uxJ{(Bg|HE55v#cB%UlW zp@==5_ZKUNRL6{|Kq}Bz;NG_r?f*;qAM<~ol)F?Q6}Tw{WVC1&Grm&v*1^kZuPyXf vy4HNt)wnhagSBI#wPS9y9Y0x-*A-p!yaslTMrYpW#QYghUD8s4|5o4!EfW}; literal 0 HcmV?d00001 diff --git a/examples/spark-image/Dockerfile b/examples/spark-image/Dockerfile new file mode 100644 index 0000000..22fc7b1 --- /dev/null +++ b/examples/spark-image/Dockerfile @@ -0,0 +1,165 @@ +FROM ubuntu:18.04 + +ARG NB_USER="sagemaker-user" +ARG NB_UID="1000" +ARG NB_GID="100" + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +# Setup the "sagemaker-user" user with root privileges. +RUN apt-get update && \ + apt-get install -y sudo && \ + useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \ + chmod g+w /etc/passwd && \ + echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + # Prevent apt-get cache from being persisted to this layer. 
+ apt-get clean && rm -rf /var/lib/apt/lists/* && \ + echo "en_US.UTF-8 UTF-8" > /etc/locale.gen + +USER $NB_UID + +# Make the default shell bash (vs "sh") for a better Jupyter terminal UX +ENV SHELL=/bin/bash \ + NB_USER=$NB_USER \ + NB_UID=$NB_UID \ + NB_GID=$NB_GID \ + LC_ALL=en_US.UTF-8 \ + LANG=en_US.UTF-8 \ + LANGUAGE=en_US.UTF-8 \ + HOME=/home/$NB_USER \ + MINICONDA_VERSION=4.6.14 \ + CONDA_VERSION=4.6.14 \ + MINICONDA_MD5=718259965f234088d785cad1fbd7de03 \ + CONDA_DIR=/opt/conda \ + PATH=$CONDA_DIR/bin:${PATH} + + +USER root +RUN apt-get update --yes +# COPY ./apt-packages.txt /root/apt-packages.txt +# RUN xargs -a /root/apt-packages.txt apt-get install -y --no-install-recommends + +RUN apt-get update --yes && \ + apt-get install --yes --no-install-recommends \ + wget \ + curl \ + ca-certificates \ + sudo \ + locales \ + fonts-liberation \ + run-one && \ + apt-get clean && rm -rf /var/lib/apt/lists/* && \ + echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \ + locale-gen + +RUN mkdir -p $CONDA_DIR && \ + chown -R $NB_USER:$NB_GID $CONDA_DIR && \ + # Fix for devtools https://github.com/conda-forge/r-devtools-feedstock/issues/4 + ln -s /bin/tar /bin/gtar + +# Copy a script that we will use to correct permissions after running certain commands +COPY fix-permissions /usr/local/bin/fix-permissions +RUN chmod a+rx /usr/local/bin/fix-permissions +COPY fix-host-settings /usr/local/bin/fix-host-settings +RUN chmod a+rx /usr/local/bin/fix-host-settings + +RUN echo "auth requisite pam_deny.so" >> /etc/pam.d/su && \ + sed -i.bak -e 's/^%admin/#%admin/' /etc/sudoers && \ + sed -i.bak -e 's/^%sudo/#%sudo/' /etc/sudoers && \ + usermod -G root ${NB_USER} && \ + mkdir -p "${CONDA_DIR}" && \ + chown "${NB_USER}:${NB_GID}" "${CONDA_DIR}" && \ + chmod g+w /etc/passwd && \ + fix-permissions "${HOME}" && \ + fix-permissions "${CONDA_DIR}" + +USER ${NB_UID} +ARG PYTHON_VERSION=3.6.14 +ENV PATH=$CONDA_DIR/bin:${PATH} +WORKDIR /tmp + +# Install conda via Miniconda +RUN curl --silent --show-error --output miniconda-installer.sh https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh && \ + echo "${MINICONDA_MD5} *miniconda-installer.sh" | md5sum -c - && \ + /bin/bash miniconda-installer.sh -f -b -p $CONDA_DIR && \ + rm miniconda-installer.sh && \ + conda config --system --prepend channels conda-forge && \ + conda config --system --set auto_update_conda false && \ + conda config --system --set show_channel_urls true && \ + conda config --system --set pip_interop_enabled true && \ + conda install --quiet --yes conda="${CONDA_VERSION%.*}.*" && \ + conda update --all --quiet --yes && \ + conda clean --all -f -y && \ + rm -rf /home/$NB_USER/.cache/yarn + +RUN conda install --quiet --yes \ + tini \ + boto3 \ + 'awscli>=1.18' \ + sagemaker_pyspark \ + 'pyspark==2.4.0' \ + 'notebook=6.4.0' \ + 'jupyterhub=1.4.1' \ + 'jupyterlab=3.0.16' && \ + conda clean --all -f -y && \ + npm cache clean --force && \ + jupyter notebook --generate-config && \ + jupyter lab clean && \ + rm -rf "/home/${NB_USER}/.cache/yarn" && \ + fix-permissions "${CONDA_DIR}" && \ + fix-permissions "/home/${NB_USER}" + +EXPOSE 8888 + +# Copy local files as late as possible to avoid cache busting +COPY start.sh start-notebook.sh start-singleuser.sh /usr/local/bin/ +# Currently need to have both jupyter_notebook_config and jupyter_server_config to support classic and lab +COPY jupyter_notebook_config.py /etc/jupyter/ + +# Fix permissions on /etc/jupyter as root +USER root + +# Prepare upgrade to JupyterLab V3.0 #1205 +RUN sed 
-re "s/c.NotebookApp/c.ServerApp/g" \
+    /etc/jupyter/jupyter_notebook_config.py > /etc/jupyter/jupyter_server_config.py && \
+    fix-permissions /etc/jupyter/
+
+# Spark dependencies
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+ARG spark_version="2.4.0"
+ARG hadoop_version="2.7"
+ARG spark_checksum="5F4184E0FE7E5C8AE67F5E6BC5DEEE881051CC712E9FF8AEDDF3529724C00E402C94BB75561DD9517A372F06C1FCB78DC7AE65DCBD4C156B3BA4D8E267EC2936"
+ARG openjdk_version="8"
+
+ENV APACHE_SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}"
+
+RUN apt-get update --yes && \
+    apt-get install --yes --no-install-recommends \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Spark installation
+WORKDIR /tmp
+RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
+    echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
+    tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
+    rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
+
+WORKDIR /usr/local
+
+# Configure Spark
+ENV SPARK_HOME=/usr/local/spark
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
+    PATH="${PATH}:${SPARK_HOME}/bin"
+
+RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark && \
+    # Add a link in the before_notebook hook in order to automatically source PYTHONPATH
+    mkdir -p /usr/local/bin/before-notebook.d && \
+    ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
+
+USER ${NB_UID}
+
+WORKDIR $HOME
diff --git a/examples/spark-image/README.md b/examples/spark-image/README.md
new file mode 100644
index 0000000..11ba33b
--- /dev/null
+++ b/examples/spark-image/README.md
@@ -0,0 +1,67 @@
+# Spark Image
+
+## Overview
+
+This image provides a Spark kernel as a Custom Image for SageMaker Studio. The custom image can be used for interactive Spark development in Python, including reading and writing data in Amazon S3 buckets.
+
+The image is based on Spark 2.4.0, Hadoop 2.7, and OpenJDK 8. It also includes the latest version of sagemaker_pyspark (1.4.2), which provides the aws-hadoop and other dependent JARs needed to work with AWS services such as Amazon Simple Storage Service (S3).
+
+The example notebook (pyspark-kernel-file-read) shows how to use different credential providers to make calls to S3.
+
+### Building the image
+
+Build the Docker image and push it to Amazon ECR.
+
+```bash
+# Modify these as required. The Docker registry endpoint can be tuned based on your current region from https://docs.aws.amazon.com/general/latest/gr/ecr.html#ecr-docker-endpoints
+REGION=
+ACCOUNT_ID=
+
+
+# Build the image
+IMAGE_NAME=spark-kernel
+aws --region ${REGION} ecr get-login-password | docker login --username AWS --password-stdin ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom
+docker build . -t ${IMAGE_NAME} -t ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}
+```
+
+```bash
+docker push ${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}
+```
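+
+Optionally, you can sanity-check the image before registering it with SageMaker. This is a minimal local check, assuming a Docker daemon on the build machine; the kernel name it reports must match the `Name` field (`python3`) in `app-image-config-input.json`:
+
+```bash
+# List the kernelspecs baked into the image; expect an entry named "python3"
+docker run --rm ${IMAGE_NAME} jupyter kernelspec list
+```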
+
+### Using with SageMaker Studio
+
+Create a SageMaker Image with the image in ECR.
+
+```bash
+# Role in your account to be used for the SageMaker Image
+ROLE_ARN=
+
+aws --region ${REGION} sagemaker create-image \
+    --image-name ${IMAGE_NAME} \
+    --role-arn ${ROLE_ARN}
+
+aws --region ${REGION} sagemaker create-image-version \
+    --image-name ${IMAGE_NAME} \
+    --base-image "${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/smstudio-custom:${IMAGE_NAME}"
+
+# Verify that the image-version is created successfully. Do NOT proceed if the image-version is in the CREATE_FAILED state or in any state other than CREATED.
+aws --region ${REGION} sagemaker describe-image-version --image-name ${IMAGE_NAME}
+```
+
+Create an AppImageConfig for this image:
+
+```bash
+aws --region ${REGION} sagemaker create-app-image-config --cli-input-json file://app-image-config-input.json
+```
+
+Create a Domain, providing the SageMaker Image and AppImageConfig as part of the Domain creation. Replace the placeholders for VPC ID, Subnet IDs, and Execution Role in `create-domain-input.json`:
+
+```bash
+aws --region ${REGION} sagemaker create-domain --cli-input-json file://create-domain-input.json
+```
+
+If you have an existing Domain, you can instead use the `update-domain` command:
+
+```bash
+aws --region ${REGION} sagemaker update-domain --cli-input-json file://update-domain-input.json
+```
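+
+To confirm that the custom image is wired up correctly, you can describe the created resources. These are read-only checks; `<domain-id>` below is a placeholder for the Domain ID returned by `create-domain` or `list-domains`:
+
+```bash
+# Inspect the registered kernel and file-system configuration
+aws --region ${REGION} sagemaker describe-app-image-config --app-image-config-name custom-spark-image-config
+
+# Confirm the custom image appears under DefaultUserSettings
+aws --region ${REGION} sagemaker describe-domain --domain-id <domain-id>
+```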
diff --git a/examples/spark-image/app-image-config-input.json b/examples/spark-image/app-image-config-input.json
new file mode 100644
index 0000000..58851f2
--- /dev/null
+++ b/examples/spark-image/app-image-config-input.json
@@ -0,0 +1,16 @@
+{
+    "AppImageConfigName": "custom-spark-image-config",
+    "KernelGatewayImageConfig": {
+        "KernelSpecs": [
+            {
+                "Name": "python3",
+                "DisplayName": "spark"
+            }
+        ],
+        "FileSystemConfig": {
+            "MountPath": "/home/sagemaker-user",
+            "DefaultUid": 1000,
+            "DefaultGid": 100
+        }
+    }
+}
\ No newline at end of file
diff --git a/examples/spark-image/create-domain-input.json b/examples/spark-image/create-domain-input.json
new file mode 100644
index 0000000..d3382ee
--- /dev/null
+++ b/examples/spark-image/create-domain-input.json
@@ -0,0 +1,19 @@
+{
+    "DomainName": "domain-with-spark-kernel-image",
+    "VpcId": "",
+    "SubnetIds": [
+        ""
+    ],
+    "DefaultUserSettings": {
+        "ExecutionRole": "",
+        "KernelGatewayAppSettings": {
+            "CustomImages": [
+                {
+                    "ImageName": "spark-kernel",
+                    "AppImageConfigName": "custom-spark-image-config"
+                }
+            ]
+        }
+    },
+    "AuthMode": "IAM"
+}
\ No newline at end of file
diff --git a/examples/spark-image/fix-host-settings b/examples/spark-image/fix-host-settings
new file mode 100644
index 0000000..3f7a128
--- /dev/null
+++ b/examples/spark-image/fix-host-settings
@@ -0,0 +1,4 @@
+#!/bin/sh
+sudo -i chmod 777 /etc/hosts > /dev/null 2>&1
+if !(grep -qi $HOSTNAME /etc/hosts); then echo "127.0.0.1 ${HOSTNAME}" >> /etc/hosts; fi
+cat /etc/hosts
\ No newline at end of file
diff --git a/examples/spark-image/fix-permissions b/examples/spark-image/fix-permissions
new file mode 100644
index 0000000..0969275
--- /dev/null
+++ b/examples/spark-image/fix-permissions
@@ -0,0 +1,35 @@
+#!/bin/bash
+# set permissions on a directory
+# after any installation, if a directory needs to be (human) user-writable,
+# run this script on it.
+# It will make everything in the directory owned by the group ${NB_GID}
+# and writable by that group.
+# Deployments that want to set a specific user id can preserve permissions
+# by adding the `--group-add users` line to `docker run`.
+
+# uses find to avoid touching files that already have the right permissions,
+# which would cause massive image explosion
+
+# right permissions are:
+# group=${NB_GID}
+# AND permissions include group rwX (directory-execute)
+# AND directories have setuid,setgid bits set
+
+set -e
+
+for d in "$@"; do
+    find "${d}" \
+        ! \( \
+            -group "${NB_GID}" \
+            -a -perm -g+rwX \
+        \) \
+        -exec chgrp "${NB_GID}" {} \; \
+        -exec chmod g+rwX {} \;
+    # setuid, setgid *on directories only*
+    find "${d}" \
+        \( \
+            -type d \
+            -a ! -perm -6000 \
+        \) \
+        -exec chmod +6000 {} \;
+done
\ No newline at end of file
diff --git a/examples/spark-image/jupyter_notebook_config.py b/examples/spark-image/jupyter_notebook_config.py
new file mode 100644
index 0000000..ba2c78c
--- /dev/null
+++ b/examples/spark-image/jupyter_notebook_config.py
@@ -0,0 +1,64 @@
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+from jupyter_core.paths import jupyter_data_dir
+import subprocess
+import os
+import errno
+import stat
+
+c = get_config() # noqa: F821
+c.NotebookApp.ip = "0.0.0.0"
+c.NotebookApp.port = 8888
+c.NotebookApp.open_browser = False
+
+# https://github.com/jupyter/notebook/issues/3130
+c.FileContentsManager.delete_to_trash = False
+
+# Generate a self-signed certificate
+if "GEN_CERT" in os.environ:
+    dir_name = jupyter_data_dir()
+    pem_file = os.path.join(dir_name, "notebook.pem")
+    try:
+        os.makedirs(dir_name)
+    except OSError as exc: # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(dir_name):
+            pass
+        else:
+            raise
+
+    # Generate an openssl.cnf file to set the distinguished name
+    cnf_file = os.path.join(os.getenv("CONDA_DIR", "/usr/lib"), "ssl", "openssl.cnf")
+    if not os.path.isfile(cnf_file):
+        with open(cnf_file, "w") as fh:
+            fh.write(
+                """\
+[req]
+distinguished_name = req_distinguished_name
+[req_distinguished_name]
+"""
+            )
+
+    # Generate a certificate if one doesn't exist on disk
+    subprocess.check_call(
+        [
+            "openssl",
+            "req",
+            "-new",
+            "-newkey=rsa:2048",
+            "-days=365",
+            "-nodes",
+            "-x509",
+            "-subj=/C=XX/ST=XX/L=XX/O=generated/CN=generated",
+            f"-keyout={pem_file}",
+            f"-out={pem_file}",
+        ]
+    )
+    # Restrict access to the file
+    os.chmod(pem_file, stat.S_IRUSR | stat.S_IWUSR)
+    c.NotebookApp.certfile = pem_file
+
+# Change default umask for all subprocesses of the notebook server if set in
+# the environment
+if "NB_UMASK" in os.environ:
+    os.umask(int(os.environ["NB_UMASK"], 8))
\ No newline at end of file
diff --git a/examples/spark-image/pyspark-kernel-file-read.ipynb b/examples/spark-image/pyspark-kernel-file-read.ipynb
new file mode 100644
index 0000000..b159107
--- /dev/null
+++ b/examples/spark-image/pyspark-kernel-file-read.ipynb
@@ -0,0 +1,70 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!/usr/local/bin/fix-host-settings "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "import sagemaker_pyspark\n",
+    "import boto3\n",
+    "\n",
+    "# Configure Spark to use the SageMaker Spark dependency jars\n",
+    "classpath = \":\".join(sagemaker_pyspark.classpath_jars())\n",
+    "spark = SparkSession.builder.config(\"spark.driver.extraClassPath\", classpath).getOrCreate()\n",
+    "\n",
+    "\n",
+    "# This option reads objects using long-term credentials\n",
+    "# spark._jsc.hadoopConfiguration().set(\"fs.s3a.aws.credentials.provider\",\"org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider\")\n",
+    "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.access.key', 'accesskey')\n",
+    "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.secret.key', 'secretkey')\n",
+    "\n",
+    "# This option reads objects using short-term credentials\n",
+    "# spark._jsc.hadoopConfiguration().set(\"fs.s3a.aws.credentials.provider\",\"org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider\")\n",
+    "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.access.key', 'accesskey')\n",
+    "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.secret.key', 'secretkey')\n",
+    "# spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.session.token', 'sessiontoken')\n",
+    "\n",
+    "\n",
+    "# This option reads publicly readable objects/buckets\n",
+    "spark.sparkContext._jsc.hadoopConfiguration().set('fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider')\n",
+    "\n",
+    "df = spark.read.json(\"s3a://awsglue-datasets/examples/us-legislators/all/organizations.json\")\n",
+    "df.printSchema()\n",
+    "df.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "instance_type": "ml.t3.medium",
+  "kernelspec": {
+   "display_name": "ask (ask/1)",
+   "language": "python",
+   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:694846537485:image-version/ask/1"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/examples/spark-image/start-notebook.sh b/examples/spark-image/start-notebook.sh
new file mode 100644
index 0000000..5d45e73
--- /dev/null
+++ b/examples/spark-image/start-notebook.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+set -e
+
+wrapper=""
+if [[ "${RESTARTABLE}" == "yes" ]]; then
+    wrapper="run-one-constantly"
+fi
+
+if [[ -n "${JUPYTERHUB_API_TOKEN}" ]]; then
+    # launched by JupyterHub, use single-user entrypoint
+    exec /usr/local/bin/start-singleuser.sh "$@"
+elif [[ -n "${JUPYTER_ENABLE_LAB}" ]]; then
+    # shellcheck disable=SC1091
+    . /usr/local/bin/start.sh ${wrapper} jupyter lab "$@"
+else
+    echo "WARN: Jupyter Notebook deprecation notice https://github.com/jupyter/docker-stacks#jupyter-notebook-deprecation-notice."
+    # shellcheck disable=SC1091
+    . /usr/local/bin/start.sh ${wrapper} jupyter notebook "$@"
+fi
\ No newline at end of file
diff --git a/examples/spark-image/start-singleuser.sh b/examples/spark-image/start-singleuser.sh
new file mode 100644
index 0000000..ac92359
--- /dev/null
+++ b/examples/spark-image/start-singleuser.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+set -e
+
+# set default ip to 0.0.0.0
+if [[ "${NOTEBOOK_ARGS} $*" != *"--ip="* ]]; then
+    NOTEBOOK_ARGS="--ip=0.0.0.0 ${NOTEBOOK_ARGS}"
+fi
+
+# handle some deprecated environment variables
+# from DockerSpawner < 0.8.
+# These won't be passed from DockerSpawner 0.9, +# so avoid specifying --arg=empty-string +if [ -n "${NOTEBOOK_DIR}" ]; then + # shellcheck disable=SC2089 + NOTEBOOK_ARGS="--notebook-dir='${NOTEBOOK_DIR}' ${NOTEBOOK_ARGS}" +fi +if [ -n "${JPY_PORT}" ]; then + NOTEBOOK_ARGS="--port=${JPY_PORT} ${NOTEBOOK_ARGS}" +fi +if [ -n "${JPY_USER}" ]; then + NOTEBOOK_ARGS="--user=${JPY_USER} ${NOTEBOOK_ARGS}" +fi +if [ -n "${JPY_COOKIE_NAME}" ]; then + NOTEBOOK_ARGS="--cookie-name=${JPY_COOKIE_NAME} ${NOTEBOOK_ARGS}" +fi +if [ -n "${JPY_BASE_URL}" ]; then + NOTEBOOK_ARGS="--base-url=${JPY_BASE_URL} ${NOTEBOOK_ARGS}" +fi +if [ -n "${JPY_HUB_PREFIX}" ]; then + NOTEBOOK_ARGS="--hub-prefix=${JPY_HUB_PREFIX} ${NOTEBOOK_ARGS}" +fi +if [ -n "${JPY_HUB_API_URL}" ]; then + NOTEBOOK_ARGS="--hub-api-url=${JPY_HUB_API_URL} ${NOTEBOOK_ARGS}" +fi +NOTEBOOK_BIN="jupyterhub-singleuser" + +# shellcheck disable=SC1091,SC2086,SC2090 +. /usr/local/bin/start.sh "${NOTEBOOK_BIN}" ${NOTEBOOK_ARGS} "$@" \ No newline at end of file diff --git a/examples/spark-image/start.sh b/examples/spark-image/start.sh new file mode 100644 index 0000000..c46005c --- /dev/null +++ b/examples/spark-image/start.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +set -e + +# Exec the specified command or fall back on bash +if [ $# -eq 0 ]; then + cmd=( "bash" ) +else + cmd=( "$@" ) +fi + +run-hooks () { + # Source scripts or run executable files in a directory + if [[ ! -d "${1}" ]] ; then + return + fi + echo "${0}: running hooks in ${1}" + for f in "${1}/"*; do + case "${f}" in + *.sh) + echo "${0}: running ${f}" + # shellcheck disable=SC1090 + source "${f}" + ;; + *) + if [[ -x "${f}" ]] ; then + echo "${0}: running ${f}" + "${f}" + else + echo "${0}: ignoring ${f}" + fi + ;; + esac + done + echo "${0}: done running hooks in ${1}" +} + +run-hooks /usr/local/bin/start-notebook.d + +# Handle special flags if we're root +if [ "$(id -u)" == 0 ] ; then + + # Only attempt to change the jovyan username if it exists + if id jovyan &> /dev/null ; then + echo "Set username to: ${NB_USER}" + usermod -d "/home/${NB_USER}" -l "${NB_USER}" jovyan + fi + + # handle home and working directory if the username changed + if [[ "${NB_USER}" != "jovyan" ]]; then + # changing username, make sure homedir exists + # (it could be mounted, and we shouldn't create it if it already exists) + if [[ ! 
-e "/home/${NB_USER}" ]]; then
+            echo "Relocating home dir to /home/${NB_USER}"
+            mv /home/jovyan "/home/${NB_USER}" || ln -s /home/jovyan "/home/${NB_USER}"
+        fi
+        # if workdir is in /home/jovyan, cd to /home/${NB_USER}
+        if [[ "${PWD}/" == "/home/jovyan/"* ]]; then
+            newcwd="/home/${NB_USER}/${PWD:13}"
+            echo "Setting CWD to ${newcwd}"
+            cd "${newcwd}"
+        fi
+    fi
+
+    # Handle case where provisioned storage does not have the correct permissions by default
+    # Ex: default NFS/EFS (no auto-uid/gid)
+    if [[ "${CHOWN_HOME}" == "1" || "${CHOWN_HOME}" == 'yes' ]]; then
+        echo "Changing ownership of /home/${NB_USER} to ${NB_UID}:${NB_GID} with options '${CHOWN_HOME_OPTS}'"
+        # shellcheck disable=SC2086
+        chown ${CHOWN_HOME_OPTS} "${NB_UID}:${NB_GID}" "/home/${NB_USER}"
+    fi
+    if [ -n "${CHOWN_EXTRA}" ]; then
+        for extra_dir in $(echo "${CHOWN_EXTRA}" | tr ',' ' '); do
+            echo "Changing ownership of ${extra_dir} to ${NB_UID}:${NB_GID} with options '${CHOWN_EXTRA_OPTS}'"
+            # shellcheck disable=SC2086
+            chown ${CHOWN_EXTRA_OPTS} "${NB_UID}:${NB_GID}" "${extra_dir}"
+        done
+    fi
+
+    # Change UID:GID of NB_USER to NB_UID:NB_GID if it does not match
+    if [ "${NB_UID}" != "$(id -u "${NB_USER}")" ] || [ "${NB_GID}" != "$(id -g "${NB_USER}")" ]; then
+        echo "Set user ${NB_USER} UID:GID to: ${NB_UID}:${NB_GID}"
+        if [ "${NB_GID}" != "$(id -g "${NB_USER}")" ]; then
+            groupadd -f -g "${NB_GID}" -o "${NB_GROUP:-${NB_USER}}"
+        fi
+        userdel "${NB_USER}"
+        useradd --home "/home/${NB_USER}" -u "${NB_UID}" -g "${NB_GID}" -G 100 -l "${NB_USER}"
+    fi
+
+    # Enable sudo if requested
+    if [[ "${GRANT_SUDO}" == "1" || "${GRANT_SUDO}" == 'yes' ]]; then
+        echo "Granting ${NB_USER} sudo access and appending ${CONDA_DIR}/bin to sudo PATH"
+        echo "${NB_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/notebook
+    fi
+
+    # Add ${CONDA_DIR}/bin to sudo secure_path
+    sed -r "s#Defaults\s+secure_path\s*=\s*\"?([^\"]+)\"?#Defaults secure_path=\"\1:${CONDA_DIR}/bin\"#" /etc/sudoers | grep secure_path > /etc/sudoers.d/path
+
+    # Exec the command as NB_USER with the PATH and the rest of
+    # the environment preserved
+    run-hooks /usr/local/bin/before-notebook.d
+    echo "Executing the command:" "${cmd[@]}"
+    exec sudo -E -H -u "${NB_USER}" PATH="${PATH}" XDG_CACHE_HOME="/home/${NB_USER}/.cache" PYTHONPATH="${PYTHONPATH:-}" "${cmd[@]}"
+else
+    if [[ "${NB_UID}" == "$(id -u jovyan 2>/dev/null)" && "${NB_GID}" == "$(id -g jovyan 2>/dev/null)" ]]; then
+        # User is not attempting to override user/group via environment
+        # variables, but they could still have overridden the uid/gid that
+        # container runs as. Check that the user has an entry in the passwd
+        # file and if not add an entry.
+        STATUS=0 && whoami &> /dev/null || STATUS=$? && true
+        if [[ "${STATUS}" != "0" ]]; then
+            if [[ -w /etc/passwd ]]; then
+                echo "Adding passwd file entry for $(id -u)"
+                sed -e "s/^jovyan:/nayvoj:/" /etc/passwd > /tmp/passwd
+                echo "jovyan:x:$(id -u):$(id -g):,,,:/home/jovyan:/bin/bash" >> /tmp/passwd
+                cat /tmp/passwd > /etc/passwd
+                rm /tmp/passwd
+            else
+                echo 'Container must be run with group "root" to update passwd file'
+            fi
+        fi
+
+        # Warn if the user isn't going to be able to write files to ${HOME}.
+        if [[ ! -w /home/jovyan ]]; then
+            echo 'Container must be run with group "users" to update files'
+        fi
+    else
+        # Warn if it looks like the user wants to override the UID/GID but
+        # has not run the container as root.
+ if [[ -n "${NB_UID}" && "${NB_UID}" != "$(id -u)" ]]; then + echo "Container must be run as root to set NB_UID to ${NB_UID}" + fi + if [[ -n "${NB_GID}" && "${NB_GID}" != "$(id -g)" ]]; then + echo "Container must be run as root to set NB_GID to ${NB_GID}" + fi + fi + + # Warn if looks like user want to run in sudo mode but hasn't run + # the container as root. + if [[ "${GRANT_SUDO}" == "1" || "${GRANT_SUDO}" == 'yes' ]]; then + echo 'Container must be run as root to grant sudo permissions' + fi + + # Execute the command + run-hooks /usr/local/bin/before-notebook.d + echo "Executing the command:" "${cmd[@]}" + exec "${cmd[@]}" +fi \ No newline at end of file diff --git a/examples/spark-image/update-domain-input.json b/examples/spark-image/update-domain-input.json new file mode 100644 index 0000000..b0e504e --- /dev/null +++ b/examples/spark-image/update-domain-input.json @@ -0,0 +1,13 @@ +{ + "DomainId": "", + "DefaultUserSettings": { + "KernelGatewayAppSettings": { + "CustomImages": [ + { + "ImageName": "spark-kernel", + "AppImageConfigName": "custom-spark-image-config" + } + ] + } + } +} From 1f4a29e020924ca588a44e9be41abd39733637cd Mon Sep 17 00:00:00 2001 From: Ajeet Tewari Date: Sat, 17 Jul 2021 20:49:18 +0200 Subject: [PATCH 2/6] Delete .DS_Store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index d98352b4e543a45fbdb006a4b075e3f3a8a549c0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJ8l9o5Sel$5oG9~S-p|Q&mPaS zmDrVEV Date: Sat, 17 Jul 2021 20:52:17 +0200 Subject: [PATCH 3/6] .DS_Store Ignored! --- examples/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 examples/.gitignore diff --git a/examples/.gitignore b/examples/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/examples/.gitignore @@ -0,0 +1 @@ +.DS_Store From 7a831dd12d886e32ef4022d681b1793b18a76e70 Mon Sep 17 00:00:00 2001 From: Ajeet Tewari Date: Sat, 17 Jul 2021 20:56:28 +0200 Subject: [PATCH 4/6] Delete .DS_Store .DS_Store Ignored! --- examples/spark-image/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 examples/spark-image/.DS_Store diff --git a/examples/spark-image/.DS_Store b/examples/spark-image/.DS_Store deleted file mode 100644 index c4c641ba8675cf43a016c7e2fe00567143e7b961..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKyG{c^3>-s>NHl4XDE9{_@&~ItiWF4$0|bIn5Oh%<{Z;ucK8^9CIUO{SXd+v( zXV>f5(@k+c1CZHyGX*99#&ktoGz?A8)dzML86nbn#~y1O@QG*42i3nPl>3MVFXS{m z{uxJ{(Bg|HE55v#cB%UlW zp@==5_ZKUNRL6{|Kq}Bz;NG_r?f*;qAM<~ol)F?Q6}Tw{WVC1&Grm&v*1^kZuPyXf vy4HNt)wnhagSBI#wPS9y9Y0x-*A-p!yaslTMrYpW#QYghUD8s4|5o4!EfW}; From 9e66991e51d6e4a7cc01ec641d94ddc4d7f99d56 Mon Sep 17 00:00:00 2001 From: ajeettewari Date: Sat, 17 Jul 2021 21:13:51 +0200 Subject: [PATCH 5/6] .DS_Store Ignored! --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store From 193b86be01492bc5587d41be972ef7a65cc01362 Mon Sep 17 00:00:00 2001 From: ajeettewari Date: Sat, 17 Jul 2021 21:17:10 +0200 Subject: [PATCH 6/6] .DS_Store Ignored! 
--- examples/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 examples/.DS_Store diff --git a/examples/.DS_Store b/examples/.DS_Store deleted file mode 100644 index 9a273583ce0f191775b0ce73d66e9efc0f25beae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJ5EC}5S)b+kSwmi=$=s(>y)sLKDqO>#^56 zwmik#w*YK)+};680CTz{zI~XQ@4HXzsv<_D^Ne@g;td~o#Ooyc_keR3IN+0TIIp#hzM*^058aLPpm2zCOpJ2O gg_q--NXoqCbME)TAu;HT2c4*&0oO$)1^!!s6Z8rda{vGU