diff --git a/.gitignore b/.gitignore index f83ae5a..c511c15 100644 --- a/.gitignore +++ b/.gitignore @@ -114,3 +114,7 @@ venv.bak/ src/jupyter_notebook_config.json .idea Deployment-notes.md +add-to-swarm-with-defaults.sh +add-to-swarm.sh +remove-from-swarm.sh +docker-compose-swarm.yml diff --git a/add-to-swarm-with-defaults.sh b/add-to-swarm-with-defaults.sh deleted file mode 100644 index 766e235..0000000 --- a/add-to-swarm-with-defaults.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -cd $(cd -P -- "$(dirname -- "$0")" && pwd -P) - -./add-to-swarm.sh -p 8848 -n elk_datastack -r 5001 diff --git a/add-to-swarm.sh b/add-to-swarm.sh deleted file mode 100755 index 7865a87..0000000 --- a/add-to-swarm.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env bash -cd $(cd -P -- "$(dirname -- "$0")" && pwd -P) - -# Fetching port and network as input -PORT=8888 -REGISTRY=5000 -while [[ "$#" -gt 0 ]]; do case $1 in - -p|--port) PORT="$2"; shift;; - -r|--registry) REGISTRY="$2"; shift;; - -n|--network) NETWORK="$2"; shift;; -# -u|--uglify) uglify=1;; - *) echo "Unknown parameter passed: $1"; exit 1;; -esac; shift; done - -# Check if arguments are valid -if [[ $PORT != [0-9][0-9][0-9][0-9]* ]]; then - echo "Given port is not valid." - echo "Usage: $0 -p [port] -n [docker-network] -r [registry-port] # ports must be an integer with 4 or more digits." - exit 21 -fi - -if [[ $REGISTRY != [0-9][0-9][0-9][0-9]* ]]; then - echo "Given registry port is not valid." - echo "Usage: $0 -p [port] -n [docker-network] -r [registry-port] # ports must be an integer with 4 or more digits." - exit 21 -fi - -if [[ $NETWORK == "" ]]; then - echo "No docker network was provided to which this gpu-jupyter should be added to." - echo "Usage: $0 -p [port] -n [docker-network] -r [registry-port] # ports must be an integer with 4 or more digits." - exit 22 -fi -result=$(docker network ls) -if [[ "$result" != *" $NETWORK "* ]]; then - echo "Could not find network $NETWORK. Please provide a valid docker network." - echo "Please select a network:" - docker network ls - exit 23 -fi - -# starting in swarm -export HOSTNAME=$(hostname) -export JUPYTER_PORT=$PORT -export REGISTRY_PORT=$REGISTRY -export JUPYTER_NETWORK=$NETWORK -echo "Adding gpu-jupyter to the swarm on the node $HOSTNAME in the network $NETWORK on port $PORT and registry to port $REGISTRY." - -# substitute the blueprint docker-compose-swarm with the environment variables and stack deploy it. -envsubst < docker-compose-swarm.yml > .docker-compose-swarm.yml.envsubst -docker-compose -f .docker-compose-swarm.yml.envsubst build -docker-compose -f .docker-compose-swarm.yml.envsubst push -docker stack deploy --compose-file .docker-compose-swarm.yml.envsubst gpu -rm .docker-compose-swarm.yml.envsubst - -echo -echo "Added gpu-jupyter to docker swarm $NETWORK on port $JUPYTER_PORT." -echo "See 'docker service ps gpu_gpu-jupyter' for status info." -echo "See 'docker service logs -f gpu_gpu-jupyter' for logs." diff --git a/docker-compose-swarm.yml b/docker-compose-swarm.yml deleted file mode 100755 index 5f14297..0000000 --- a/docker-compose-swarm.yml +++ /dev/null @@ -1,32 +0,0 @@ -version: "3.4" -services: - gpu-jupyter: - image: 127.0.0.1:$REGISTRY_PORT/gpu-jupyter - build: .build - ports: - - $JUPYTER_PORT:8888 - volumes: - - ./data:/home/jovyan/work - environment: - GRANT_SUDO: "yes" - JUPYTER_ENABLE_LAB: "yes" - # enable sudo permissions - user: - "root" - networks: - - default - - $JUPYTER_NETWORK - deploy: - placement: - constraints: [node.hostname == $HOSTNAME] - replicas: 1 - update_config: - parallelism: 2 - delay: 10s - restart_policy: - condition: on-failure - -networks: - $JUPYTER_NETWORK: - external: - name: $JUPYTER_NETWORK diff --git a/extra/Getting_Started/GPU-processing.ipynb b/extra/Getting_Started/GPU-processing.ipynb index fae5cde..8741880 100644 --- a/extra/Getting_Started/GPU-processing.ipynb +++ b/extra/Getting_Started/GPU-processing.ipynb @@ -27,15 +27,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fri Dec 20 09:42:29 2019 \n", + "Tue Mar 10 17:55:25 2020 \n", "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 440.26 Driver Version: 440.26 CUDA Version: 10.2 |\n", + "| NVIDIA-SMI 440.48.02 Driver Version: 440.48.02 CUDA Version: 10.2 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "|===============================+======================+======================|\n", "| 0 GeForce RTX 207... Off | 00000000:01:00.0 Off | N/A |\n", - "| 0% 54C P0 38W / 215W | 204MiB / 7974MiB | 0% Default |\n", + "| 0% 41C P8 1W / 215W | 215MiB / 7974MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", @@ -80,9 +80,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + }, { "data": { "text/plain": [ @@ -91,41 +98,61 @@ " memory_limit: 268435456\n", " locality {\n", " }\n", - " incarnation: 891330946073693377, name: \"/device:XLA_CPU:0\"\n", + " incarnation: 933763008911863935,\n", + " name: \"/device:XLA_CPU:0\"\n", " device_type: \"XLA_CPU\"\n", " memory_limit: 17179869184\n", " locality {\n", " }\n", - " incarnation: 9415777875944419380\n", - " physical_device_desc: \"device: XLA_CPU device\"]" + " incarnation: 12790964875098705008\n", + " physical_device_desc: \"device: XLA_CPU device\",\n", + " name: \"/device:GPU:0\"\n", + " device_type: \"GPU\"\n", + " memory_limit: 6940531098\n", + " locality {\n", + " bus_id: 1\n", + " links {\n", + " }\n", + " }\n", + " incarnation: 4940791198162309705\n", + " physical_device_desc: \"device: 0, name: GeForce RTX 2070 SUPER, pci bus id: 0000:01:00.0, compute capability: 7.5\",\n", + " name: \"/device:XLA_GPU:0\"\n", + " device_type: \"XLA_GPU\"\n", + " memory_limit: 17179869184\n", + " locality {\n", + " }\n", + " incarnation: 6996862811697216940\n", + " physical_device_desc: \"device: XLA_GPU device\"]" ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "import tensorflow as tf\n", "from tensorflow.python.client import device_lib\n", + "print(tf.test.is_gpu_available(cuda_only=True))\n", "device_lib.list_local_devices()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "tensor([[0.8722, 0.5115, 0.9504],\n", - " [0.7723, 0.2860, 0.5793],\n", - " [0.5388, 0.5681, 0.4295],\n", - " [0.5269, 0.5165, 0.7475],\n", - " [0.4882, 0.8255, 0.6498]])" + "tensor([[0.8519, 0.7682, 0.3258],\n", + " [0.1957, 0.4073, 0.6085],\n", + " [0.9164, 0.8401, 0.4548],\n", + " [0.9011, 0.8838, 0.9559],\n", + " [0.4692, 0.3993, 0.4313]])" ] }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -156,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -165,14 +192,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "590 ms ± 41.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "362 ms ± 86.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -190,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -199,14 +226,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "853 ms ± 16.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "135 ms ± 3.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -225,23 +252,36 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "tensor([[0.1054, 0.3291, 0.7729, 0.6005, 0.2372],\n", - " [0.1022, 0.4534, 0.3964, 0.9174, 0.2610],\n", - " [0.3969, 0.5472, 0.3876, 0.1979, 0.4063],\n", - " [0.3630, 0.6374, 0.4176, 0.4804, 0.0396],\n", - " [0.8256, 0.2289, 0.2265, 0.4388, 0.6070]], device='cuda:0')\n", - "tensor([[0.1054, 0.3291, 0.7729, 0.6005, 0.2372],\n", - " [0.1022, 0.4534, 0.3964, 0.9174, 0.2610],\n", - " [0.3969, 0.5472, 0.3876, 0.1979, 0.4063],\n", - " [0.3630, 0.6374, 0.4176, 0.4804, 0.0396],\n", - " [0.8256, 0.2289, 0.2265, 0.4388, 0.6070]], dtype=torch.float64)\n" + "tensor([[0.2812, 0.3255, 0.5715, 0.1665, 0.6951],\n", + " [0.5562, 0.9592, 0.0911, 0.9672, 0.3311],\n", + " [0.6711, 0.0422, 0.5091, 0.6653, 0.9234],\n", + " [0.1029, 0.1447, 0.8385, 0.7580, 0.7998],\n", + " [0.7787, 0.0114, 0.4865, 0.4171, 0.7066]], device='cuda:0')\n", + "tensor([[0.2812, 0.3255, 0.5715, 0.1665, 0.6951],\n", + " [0.5562, 0.9592, 0.0911, 0.9672, 0.3311],\n", + " [0.6711, 0.0422, 0.5091, 0.6653, 0.9234],\n", + " [0.1029, 0.1447, 0.8385, 0.7580, 0.7998],\n", + " [0.7787, 0.0114, 0.4865, 0.4171, 0.7066]], dtype=torch.float64)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.7/site-packages/torch/cuda/__init__.py:134: UserWarning: \n", + " Found GPU0 GeForce RTX 2070 SUPER which requires CUDA_VERSION >= 10000 to\n", + " work properly, but your PyTorch was compiled\n", + " with CUDA_VERSION 9000. Please install the correct PyTorch binary\n", + " using instructions from https://pytorch.org\n", + " \n", + " warnings.warn(incorrect_binary_warn % (d, name, 10000, CUDA_VERSION))\n" ] } ], @@ -258,14 +298,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "11.3 ms ± 60.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "12.8 ms ± 564 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -290,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -304,18 +344,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "tensor([[0.3112, 0.7480, 0.1882, 0.8453, 0.8198],\n", - " [0.5953, 0.8401, 0.3126, 0.6025, 0.5252],\n", - " [0.1902, 0.5610, 0.7968, 0.1463, 0.7154],\n", - " [0.7979, 0.2161, 0.6176, 0.2951, 0.1980],\n", - " [0.6451, 0.3837, 0.5305, 0.2740, 0.3330]], device='cuda:0')\n" + "tensor([[0.6760, 0.8890, 0.7271, 0.4208, 0.1131],\n", + " [0.4036, 0.8012, 0.3448, 0.4120, 0.2439],\n", + " [0.6088, 0.4356, 0.9391, 0.1366, 0.4379],\n", + " [0.4540, 0.5981, 0.3885, 0.2473, 0.5938],\n", + " [0.2976, 0.8384, 0.6107, 0.6882, 0.9593]], device='cuda:0')\n" ] } ], @@ -327,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -339,18 +379,18 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "tensor([[ 1.2748e-03, 5.3656e-04, 1.7376e-04, 3.3888e-06, -1.7049e-04],\n", - " [ 5.3656e-04, 6.3624e-04, 2.5957e-05, 3.3281e-04, -1.6239e-05],\n", - " [ 1.7376e-04, 2.5957e-05, 7.6328e-04, 7.7603e-05, 1.8272e-04],\n", - " [ 3.3888e-06, 3.3281e-04, 7.7603e-05, 9.6281e-04, 1.2375e-04],\n", - " [-1.7049e-04, -1.6239e-05, 1.8272e-04, 1.2375e-04, 6.9231e-04]],\n", + "tensor([[ 1.1191e-03, 1.6152e-04, -2.1592e-04, 1.4253e-04, -4.0365e-04],\n", + " [ 1.6151e-04, 5.5901e-04, 2.6872e-04, -3.1842e-06, 2.8985e-04],\n", + " [-2.1592e-04, 2.6872e-04, 1.0728e-03, -3.5968e-05, 5.5613e-04],\n", + " [ 1.4253e-04, -3.1840e-06, -3.5968e-05, 6.5156e-04, -3.1820e-04],\n", + " [-4.0365e-04, 2.8985e-04, 5.5613e-04, -3.1820e-04, 1.4067e-03]],\n", " device='cuda:0')\n" ] } @@ -362,18 +402,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "tensor([[ 1.2748e-03, 5.3656e-04, 1.7376e-04, 3.3888e-06, -1.7049e-04],\n", - " [ 5.3656e-04, 6.3624e-04, 2.5957e-05, 3.3281e-04, -1.6239e-05],\n", - " [ 1.7376e-04, 2.5957e-05, 7.6328e-04, 7.7603e-05, 1.8272e-04],\n", - " [ 3.3888e-06, 3.3281e-04, 7.7603e-05, 9.6281e-04, 1.2375e-04],\n", - " [-1.7049e-04, -1.6239e-05, 1.8272e-04, 1.2375e-04, 6.9231e-04]],\n", + "tensor([[ 1.1191e-03, 1.6152e-04, -2.1592e-04, 1.4253e-04, -4.0365e-04],\n", + " [ 1.6151e-04, 5.5901e-04, 2.6872e-04, -3.1842e-06, 2.8985e-04],\n", + " [-2.1592e-04, 2.6872e-04, 1.0728e-03, -3.5968e-05, 5.5613e-04],\n", + " [ 1.4253e-04, -3.1840e-06, -3.5968e-05, 6.5156e-04, -3.1820e-04],\n", + " [-4.0365e-04, 2.8985e-04, 5.5613e-04, -3.1820e-04, 1.4067e-03]],\n", " dtype=torch.float64)\n" ] } @@ -409,7 +449,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.6" } }, "nbformat": 4, diff --git a/remove-from-swarm.sh b/remove-from-swarm.sh deleted file mode 100755 index 28671fd..0000000 --- a/remove-from-swarm.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -cd $(cd -P -- "$(dirname -- "$0")" && pwd -P) - -echo "Removing gpu-jupyter from docker swarm." -docker stack rm gpu