Merge branch 'master' of https://github.com/iot-salzburg/gpu-jupyter

2020-03-11 07:48:31 +01:00
parent e894fd0e9c b8ad1955e0
commit db349d928b
7 changed files with 111 additions and 159 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,7 @@ venv.bak/
 src/jupyter_notebook_config.json
 .idea
 Deployment-notes.md
+add-to-swarm-with-defaults.sh
+add-to-swarm.sh
+remove-from-swarm.sh
+docker-compose-swarm.yml
--- a/README.md
+++ b/README.md
@@ -194,15 +194,22 @@ Then re-generate and re-run the image, as closer described above:

 #### Update Docker-Stack

-The [docker-stacks](https://github.com/jupyter/docker-stacks) are used as 
-submodule within `.build/docker-stacks`. To update the generated Dockerfile these run:
+The [docker-stacks](https://github.com/jupyter/docker-stacks) are used as a
+submodule within `.build/docker-stacks`. By default, the submodule's HEAD is reset to a stable commit.
+To update the generated Dockerfile to a specific commit, run:

 ```bash
-cd .build/docker-stacks/ && git pull && cd -
-./generate_Dockerfile.sh
+./generate_Dockerfile.sh --commit c1c32938438151c7e2a22b5aa338caba2ec01da2
 ```

-A new build can last some time and may consume a lot of data.
+To update the generated Dockerfile to the latest commit, run:
+
+```bash
+./generate_Dockerfile.sh --commit latest
+```
+
+A new build can take some time and may consume a lot of data traffic. Note that the latest version may result in
+a version conflict!
 More info to submodules can be found in
 [this tutorial](https://www.vogella.com/tutorials/GitSubmodules/article.html).

--- a/add-to-swarm-with-defaults.sh
+++ b/add-to-swarm-with-defaults.sh
@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-cd $(cd -P -- "$(dirname -- "$0")" && pwd -P)
-
-./add-to-swarm.sh -p 8848 -n elk_datastack -r 5001
--- a/add-to-swarm.sh
+++ b/add-to-swarm.sh
@@ -1,58 +0,0 @@
-#!/usr/bin/env bash
-cd $(cd -P -- "$(dirname -- "$0")" && pwd -P)
-
-# Fetching port and network as input
-PORT=8888
-REGISTRY=5000
-while [[ "$#" -gt 0 ]]; do case $1 in
-  -p|--port) PORT="$2"; shift;;
-  -r|--registry) REGISTRY="$2"; shift;;
-  -n|--network) NETWORK="$2"; shift;;
-#  -u|--uglify) uglify=1;;
-  *) echo "Unknown parameter passed: $1"; exit 1;;
-esac; shift; done
-
-# Check if arguments are valid
-if [[ $PORT != [0-9][0-9][0-9][0-9]* ]]; then
-    echo "Given port is not valid."
-    echo "Usage: $0 -p [port] -n [docker-network] -r [registry-port] # ports must be an integer with 4 or more digits."
-    exit 21
-fi
-
-if [[ $REGISTRY != [0-9][0-9][0-9][0-9]* ]]; then
-    echo "Given registry port is not valid."
-    echo "Usage: $0 -p [port] -n [docker-network] -r [registry-port] # ports must be an integer with 4 or more digits."
-    exit 21
-fi
-
-if [[ $NETWORK == "" ]]; then
-    echo "No docker network was provided to which this gpu-jupyter should be added to."
-    echo "Usage: $0 -p [port] -n [docker-network] -r [registry-port] # ports must be an integer with 4 or more digits."
-    exit 22
-fi
-result=$(docker network ls)
-if [[ "$result" != *" $NETWORK "* ]]; then
-    echo "Could not find network $NETWORK. Please provide a valid docker network."
-    echo "Please select a network:"
-    docker network ls
-    exit 23
-fi
-
-# starting in swarm
-export HOSTNAME=$(hostname)
-export JUPYTER_PORT=$PORT
-export REGISTRY_PORT=$REGISTRY
-export JUPYTER_NETWORK=$NETWORK
-echo "Adding gpu-jupyter to the swarm on the node $HOSTNAME in the network $NETWORK on port $PORT and registry to port $REGISTRY."
-
-# substitute the blueprint docker-compose-swarm with the environment variables and stack deploy it.
-envsubst < docker-compose-swarm.yml > .docker-compose-swarm.yml.envsubst
-docker-compose -f .docker-compose-swarm.yml.envsubst build
-docker-compose -f .docker-compose-swarm.yml.envsubst push
-docker stack deploy --compose-file .docker-compose-swarm.yml.envsubst gpu
-rm .docker-compose-swarm.yml.envsubst
-
-echo
-echo "Added gpu-jupyter to docker swarm $NETWORK on port $JUPYTER_PORT."
-echo "See 'docker service ps gpu_gpu-jupyter' for status info."
-echo "See 'docker service logs -f gpu_gpu-jupyter' for logs."
--- a/docker-compose-swarm.yml
+++ b/docker-compose-swarm.yml
@@ -1,32 +0,0 @@
-version: "3.4"
-services:
-  gpu-jupyter:
-    image: 127.0.0.1:$REGISTRY_PORT/gpu-jupyter
-    build: .build
-    ports:
-      - $JUPYTER_PORT:8888
-    volumes:
-      - ./data:/home/jovyan/work
-    environment:
-      GRANT_SUDO: "yes"
-      JUPYTER_ENABLE_LAB: "yes"
-    # enable sudo permissions
-    user:
-      "root"
-    networks:
-      - default
-      - $JUPYTER_NETWORK
-    deploy:
-      placement:
-        constraints: [node.hostname == $HOSTNAME]
-      replicas: 1
-      update_config:
-        parallelism: 2
-        delay: 10s
-      restart_policy:
-        condition: on-failure
-
-networks:
-  $JUPYTER_NETWORK:
-    external:
-      name: $JUPYTER_NETWORK
--- a/extra/Getting_Started/GPU-processing.ipynb
+++ b/extra/Getting_Started/GPU-processing.ipynb
@@ -27,15 +27,15 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Fri Dec 20 09:42:29 2019       \n",
+      "Tue Mar 10 17:55:25 2020       \n",
      "+-----------------------------------------------------------------------------+\n",
-      "| NVIDIA-SMI 440.26       Driver Version: 440.26       CUDA Version: 10.2     |\n",
+      "| NVIDIA-SMI 440.48.02    Driver Version: 440.48.02    CUDA Version: 10.2     |\n",
      "|-------------------------------+----------------------+----------------------+\n",
      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
      "|===============================+======================+======================|\n",
      "|   0  GeForce RTX 207...  Off  | 00000000:01:00.0 Off |                  N/A |\n",
-      "|  0%   54C    P0    38W / 215W |    204MiB /  7974MiB |      0%      Default |\n",
+      "|  0%   41C    P8     1W / 215W |    215MiB /  7974MiB |      0%      Default |\n",
      "+-------------------------------+----------------------+----------------------+\n",
      "                                                                               \n",
      "+-----------------------------------------------------------------------------+\n",
@@ -80,9 +80,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n"
+     ]
+    },
    {
     "data": {
      "text/plain": [
@@ -91,41 +98,61 @@
       " memory_limit: 268435456\n",
       " locality {\n",
       " }\n",
-       " incarnation: 891330946073693377, name: \"/device:XLA_CPU:0\"\n",
+       " incarnation: 933763008911863935,\n",
+       " name: \"/device:XLA_CPU:0\"\n",
       " device_type: \"XLA_CPU\"\n",
       " memory_limit: 17179869184\n",
       " locality {\n",
       " }\n",
-       " incarnation: 9415777875944419380\n",
-       " physical_device_desc: \"device: XLA_CPU device\"]"
+       " incarnation: 12790964875098705008\n",
+       " physical_device_desc: \"device: XLA_CPU device\",\n",
+       " name: \"/device:GPU:0\"\n",
+       " device_type: \"GPU\"\n",
+       " memory_limit: 6940531098\n",
+       " locality {\n",
+       "   bus_id: 1\n",
+       "   links {\n",
+       "   }\n",
+       " }\n",
+       " incarnation: 4940791198162309705\n",
+       " physical_device_desc: \"device: 0, name: GeForce RTX 2070 SUPER, pci bus id: 0000:01:00.0, compute capability: 7.5\",\n",
+       " name: \"/device:XLA_GPU:0\"\n",
+       " device_type: \"XLA_GPU\"\n",
+       " memory_limit: 17179869184\n",
+       " locality {\n",
+       " }\n",
+       " incarnation: 6996862811697216940\n",
+       " physical_device_desc: \"device: XLA_GPU device\"]"
      ]
     },
-     "execution_count": 3,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
+    "import tensorflow as tf\n",
    "from tensorflow.python.client import device_lib\n",
+    "print(tf.test.is_gpu_available(cuda_only=True))\n",
    "device_lib.list_local_devices()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "tensor([[0.8722, 0.5115, 0.9504],\n",
-       "        [0.7723, 0.2860, 0.5793],\n",
-       "        [0.5388, 0.5681, 0.4295],\n",
-       "        [0.5269, 0.5165, 0.7475],\n",
-       "        [0.4882, 0.8255, 0.6498]])"
+       "tensor([[0.8519, 0.7682, 0.3258],\n",
+       "        [0.1957, 0.4073, 0.6085],\n",
+       "        [0.9164, 0.8401, 0.4548],\n",
+       "        [0.9011, 0.8838, 0.9559],\n",
+       "        [0.4692, 0.3993, 0.4313]])"
      ]
     },
-     "execution_count": 4,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -156,7 +183,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -165,14 +192,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "590 ms ± 41.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "362 ms ± 86.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
@@ -190,7 +217,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -199,14 +226,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "853 ms ± 16.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+      "135 ms ± 3.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
@@ -225,23 +252,36 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "tensor([[0.1054, 0.3291, 0.7729, 0.6005, 0.2372],\n",
-      "        [0.1022, 0.4534, 0.3964, 0.9174, 0.2610],\n",
-      "        [0.3969, 0.5472, 0.3876, 0.1979, 0.4063],\n",
-      "        [0.3630, 0.6374, 0.4176, 0.4804, 0.0396],\n",
-      "        [0.8256, 0.2289, 0.2265, 0.4388, 0.6070]], device='cuda:0')\n",
-      "tensor([[0.1054, 0.3291, 0.7729, 0.6005, 0.2372],\n",
-      "        [0.1022, 0.4534, 0.3964, 0.9174, 0.2610],\n",
-      "        [0.3969, 0.5472, 0.3876, 0.1979, 0.4063],\n",
-      "        [0.3630, 0.6374, 0.4176, 0.4804, 0.0396],\n",
-      "        [0.8256, 0.2289, 0.2265, 0.4388, 0.6070]], dtype=torch.float64)\n"
+      "tensor([[0.2812, 0.3255, 0.5715, 0.1665, 0.6951],\n",
+      "        [0.5562, 0.9592, 0.0911, 0.9672, 0.3311],\n",
+      "        [0.6711, 0.0422, 0.5091, 0.6653, 0.9234],\n",
+      "        [0.1029, 0.1447, 0.8385, 0.7580, 0.7998],\n",
+      "        [0.7787, 0.0114, 0.4865, 0.4171, 0.7066]], device='cuda:0')\n",
+      "tensor([[0.2812, 0.3255, 0.5715, 0.1665, 0.6951],\n",
+      "        [0.5562, 0.9592, 0.0911, 0.9672, 0.3311],\n",
+      "        [0.6711, 0.0422, 0.5091, 0.6653, 0.9234],\n",
+      "        [0.1029, 0.1447, 0.8385, 0.7580, 0.7998],\n",
+      "        [0.7787, 0.0114, 0.4865, 0.4171, 0.7066]], dtype=torch.float64)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.7/site-packages/torch/cuda/__init__.py:134: UserWarning: \n",
+      "    Found GPU0 GeForce RTX 2070 SUPER which requires CUDA_VERSION >= 10000 to\n",
+      "     work properly, but your PyTorch was compiled\n",
+      "     with CUDA_VERSION 9000. Please install the correct PyTorch binary\n",
+      "     using instructions from https://pytorch.org\n",
+      "    \n",
+      "  warnings.warn(incorrect_binary_warn % (d, name, 10000, CUDA_VERSION))\n"
     ]
    }
   ],
@@ -258,14 +298,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "11.3 ms ± 60.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "12.8 ms ± 564 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
@@ -290,7 +330,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -304,18 +344,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "tensor([[0.3112, 0.7480, 0.1882, 0.8453, 0.8198],\n",
-      "        [0.5953, 0.8401, 0.3126, 0.6025, 0.5252],\n",
-      "        [0.1902, 0.5610, 0.7968, 0.1463, 0.7154],\n",
-      "        [0.7979, 0.2161, 0.6176, 0.2951, 0.1980],\n",
-      "        [0.6451, 0.3837, 0.5305, 0.2740, 0.3330]], device='cuda:0')\n"
+      "tensor([[0.6760, 0.8890, 0.7271, 0.4208, 0.1131],\n",
+      "        [0.4036, 0.8012, 0.3448, 0.4120, 0.2439],\n",
+      "        [0.6088, 0.4356, 0.9391, 0.1366, 0.4379],\n",
+      "        [0.4540, 0.5981, 0.3885, 0.2473, 0.5938],\n",
+      "        [0.2976, 0.8384, 0.6107, 0.6882, 0.9593]], device='cuda:0')\n"
     ]
    }
   ],
@@ -327,7 +367,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -339,18 +379,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "tensor([[ 1.2748e-03,  5.3656e-04,  1.7376e-04,  3.3888e-06, -1.7049e-04],\n",
-      "        [ 5.3656e-04,  6.3624e-04,  2.5957e-05,  3.3281e-04, -1.6239e-05],\n",
-      "        [ 1.7376e-04,  2.5957e-05,  7.6328e-04,  7.7603e-05,  1.8272e-04],\n",
-      "        [ 3.3888e-06,  3.3281e-04,  7.7603e-05,  9.6281e-04,  1.2375e-04],\n",
-      "        [-1.7049e-04, -1.6239e-05,  1.8272e-04,  1.2375e-04,  6.9231e-04]],\n",
+      "tensor([[ 1.1191e-03,  1.6152e-04, -2.1592e-04,  1.4253e-04, -4.0365e-04],\n",
+      "        [ 1.6151e-04,  5.5901e-04,  2.6872e-04, -3.1842e-06,  2.8985e-04],\n",
+      "        [-2.1592e-04,  2.6872e-04,  1.0728e-03, -3.5968e-05,  5.5613e-04],\n",
+      "        [ 1.4253e-04, -3.1840e-06, -3.5968e-05,  6.5156e-04, -3.1820e-04],\n",
+      "        [-4.0365e-04,  2.8985e-04,  5.5613e-04, -3.1820e-04,  1.4067e-03]],\n",
      "       device='cuda:0')\n"
     ]
    }
@@ -362,18 +402,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "tensor([[ 1.2748e-03,  5.3656e-04,  1.7376e-04,  3.3888e-06, -1.7049e-04],\n",
-      "        [ 5.3656e-04,  6.3624e-04,  2.5957e-05,  3.3281e-04, -1.6239e-05],\n",
-      "        [ 1.7376e-04,  2.5957e-05,  7.6328e-04,  7.7603e-05,  1.8272e-04],\n",
-      "        [ 3.3888e-06,  3.3281e-04,  7.7603e-05,  9.6281e-04,  1.2375e-04],\n",
-      "        [-1.7049e-04, -1.6239e-05,  1.8272e-04,  1.2375e-04,  6.9231e-04]],\n",
+      "tensor([[ 1.1191e-03,  1.6152e-04, -2.1592e-04,  1.4253e-04, -4.0365e-04],\n",
+      "        [ 1.6151e-04,  5.5901e-04,  2.6872e-04, -3.1842e-06,  2.8985e-04],\n",
+      "        [-2.1592e-04,  2.6872e-04,  1.0728e-03, -3.5968e-05,  5.5613e-04],\n",
+      "        [ 1.4253e-04, -3.1840e-06, -3.5968e-05,  6.5156e-04, -3.1820e-04],\n",
+      "        [-4.0365e-04,  2.8985e-04,  5.5613e-04, -3.1820e-04,  1.4067e-03]],\n",
      "       dtype=torch.float64)\n"
     ]
    }
@@ -409,7 +449,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.7.6"
  }
 },
 "nbformat": 4,
--- a/remove-from-swarm.sh
+++ b/remove-from-swarm.sh
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-cd $(cd -P -- "$(dirname -- "$0")" && pwd -P)
-
-echo "Removing gpu-jupyter from docker swarm."
-docker stack rm gpu