This commit is contained in:
Christoph Schranz 2020-03-11 07:48:31 +01:00
commit db349d928b
7 changed files with 111 additions and 159 deletions

4
.gitignore vendored
View File

@ -114,3 +114,7 @@ venv.bak/
src/jupyter_notebook_config.json src/jupyter_notebook_config.json
.idea .idea
Deployment-notes.md Deployment-notes.md
add-to-swarm-with-defaults.sh
add-to-swarm.sh
remove-from-swarm.sh
docker-compose-swarm.yml

View File

@ -194,15 +194,22 @@ Then re-generate and re-run the image, as closer described above:
#### Update Docker-Stack #### Update Docker-Stack
The [docker-stacks](https://github.com/jupyter/docker-stacks) are used as The [docker-stacks](https://github.com/jupyter/docker-stacks) are used as a
submodule within `.build/docker-stacks`. To update the generated Dockerfile these run: submodule within `.build/docker-stacks`. Per default, the head of the commit is reset to a stable commit.
To update the generated Dockerfile to a specific commit, run:
```bash ```bash
cd .build/docker-stacks/ && git pull && cd - ./generate_Dockerfile.sh --commit c1c32938438151c7e2a22b5aa338caba2ec01da2
./generate_Dockerfile.sh
``` ```
A new build can last some time and may consume a lot of data. To update the generated Dockerfile to the commit, run:
```bash
./generate_Dockerfile.sh --commit latest
```
A new build can last some time and may consume a lot of data traffic. Note, that the latest version may result in
a version conflict!
More info to submodules can be found in More info to submodules can be found in
[this tutorial](https://www.vogella.com/tutorials/GitSubmodules/article.html). [this tutorial](https://www.vogella.com/tutorials/GitSubmodules/article.html).

View File

@ -1,4 +0,0 @@
#!/usr/bin/env bash
# Deploy gpu-jupyter to the swarm with the project defaults:
#   Jupyter on port 8848, registry on port 5001, network "elk_datastack".
# Resolve this script's own directory so it can be invoked from anywhere;
# quote the substitution (unquoted it word-splits on paths with spaces)
# and abort if the cd fails rather than running in the wrong directory.
cd "$(cd -P -- "$(dirname -- "$0")" && pwd -P)" || exit 1
./add-to-swarm.sh -p 8848 -n elk_datastack -r 5001

View File

@ -1,58 +0,0 @@
#!/usr/bin/env bash
# Add gpu-jupyter as a service to an existing docker swarm:
#   1. parse and validate -p/--port, -r/--registry and -n/--network,
#   2. substitute them into docker-compose-swarm.yml via envsubst,
#   3. build, push (through the local registry) and stack-deploy it.
# Exit codes: 1 bad flag, 21 bad port, 22 missing network, 23 unknown network.

# Run relative to this script's directory, regardless of the caller's cwd.
cd "$(cd -P -- "$(dirname -- "$0")" && pwd -P)" || exit 1

# Defaults; the network has no default and is required.
PORT=8888
REGISTRY=5000
NETWORK=""
usage="Usage: $0 -p [port] -n [docker-network] -r [registry-port] # ports must be an integer with 4 or more digits."

# Fetching port, registry port and network as input
while [[ "$#" -gt 0 ]]; do case $1 in
  -p|--port) PORT="$2"; shift;;
  -r|--registry) REGISTRY="$2"; shift;;
  -n|--network) NETWORK="$2"; shift;;
  *) echo "Unknown parameter passed: $1"; exit 1;;
esac; shift; done

# Check if arguments are valid: ports must have at least 4 digits.
if [[ "$PORT" != [0-9][0-9][0-9][0-9]* ]]; then
  echo "Given port is not valid."
  echo "$usage"
  exit 21
fi
if [[ "$REGISTRY" != [0-9][0-9][0-9][0-9]* ]]; then
  echo "Given registry port is not valid."
  echo "$usage"
  exit 21
fi
if [[ -z "$NETWORK" ]]; then
  echo "No docker network was provided to which this gpu-jupyter should be added to."
  echo "$usage"
  exit 22
fi

# The target network must already exist in the docker daemon; the padded
# " $NETWORK " match avoids false hits on networks with a common prefix.
result=$(docker network ls)
if [[ "$result" != *" $NETWORK "* ]]; then
  echo "Could not find network $NETWORK. Please provide a valid docker network."
  echo "Please select a network:"
  docker network ls
  exit 23
fi

# Export the deployment parameters for envsubst below.
HOSTNAME=$(hostname)
export HOSTNAME
export JUPYTER_PORT=$PORT
export REGISTRY_PORT=$REGISTRY
export JUPYTER_NETWORK=$NETWORK
echo "Adding gpu-jupyter to the swarm on the node $HOSTNAME in the network $NETWORK on port $PORT and registry to port $REGISTRY."

# Substitute the blueprint docker-compose-swarm with the environment variables
# and stack deploy it. Abort before deploying if build or push fails, so a
# stale image is never rolled out to the swarm.
envsubst < docker-compose-swarm.yml > .docker-compose-swarm.yml.envsubst
docker-compose -f .docker-compose-swarm.yml.envsubst build || exit 1
docker-compose -f .docker-compose-swarm.yml.envsubst push || exit 1
docker stack deploy --compose-file .docker-compose-swarm.yml.envsubst gpu
rm .docker-compose-swarm.yml.envsubst
echo
echo "Added gpu-jupyter to docker swarm $NETWORK on port $JUPYTER_PORT."
echo "See 'docker service ps gpu_gpu-jupyter' for status info."
echo "See 'docker service logs -f gpu_gpu-jupyter' for logs."

View File

@ -1,32 +0,0 @@
# Swarm deployment blueprint for gpu-jupyter. The $-placeholders
# (REGISTRY_PORT, JUPYTER_PORT, JUPYTER_NETWORK, HOSTNAME) are filled in
# via envsubst by add-to-swarm.sh before `docker stack deploy`.
version: "3.4"
services:
  gpu-jupyter:
    # Tag against the local registry so every swarm node can pull the image.
    image: 127.0.0.1:$REGISTRY_PORT/gpu-jupyter
    build: .build
    ports:
      - $JUPYTER_PORT:8888
    volumes:
      - ./data:/home/jovyan/work
    environment:
      GRANT_SUDO: "yes"
      JUPYTER_ENABLE_LAB: "yes"
    # enable sudo permissions
    user:
      "root"
    networks:
      - default
      - $JUPYTER_NETWORK
    deploy:
      placement:
        # Pin the service to the node the script ran on (the GPU host).
        constraints: [node.hostname == $HOSTNAME]
      replicas: 1
      update_config:
        parallelism: 2
        delay: 10s
      restart_policy:
        condition: on-failure
networks:
  # Attach to a pre-existing (external) network rather than creating one.
  $JUPYTER_NETWORK:
    external:
      name: $JUPYTER_NETWORK

View File

@ -27,15 +27,15 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Fri Dec 20 09:42:29 2019 \n", "Tue Mar 10 17:55:25 2020 \n",
"+-----------------------------------------------------------------------------+\n", "+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 440.26 Driver Version: 440.26 CUDA Version: 10.2 |\n", "| NVIDIA-SMI 440.48.02 Driver Version: 440.48.02 CUDA Version: 10.2 |\n",
"|-------------------------------+----------------------+----------------------+\n", "|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"|===============================+======================+======================|\n", "|===============================+======================+======================|\n",
"| 0 GeForce RTX 207... Off | 00000000:01:00.0 Off | N/A |\n", "| 0 GeForce RTX 207... Off | 00000000:01:00.0 Off | N/A |\n",
"| 0% 54C P0 38W / 215W | 204MiB / 7974MiB | 0% Default |\n", "| 0% 41C P8 1W / 215W | 215MiB / 7974MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n", "+-------------------------------+----------------------+----------------------+\n",
" \n", " \n",
"+-----------------------------------------------------------------------------+\n", "+-----------------------------------------------------------------------------+\n",
@ -80,9 +80,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
},
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
@ -91,41 +98,61 @@
" memory_limit: 268435456\n", " memory_limit: 268435456\n",
" locality {\n", " locality {\n",
" }\n", " }\n",
" incarnation: 891330946073693377, name: \"/device:XLA_CPU:0\"\n", " incarnation: 933763008911863935,\n",
" name: \"/device:XLA_CPU:0\"\n",
" device_type: \"XLA_CPU\"\n", " device_type: \"XLA_CPU\"\n",
" memory_limit: 17179869184\n", " memory_limit: 17179869184\n",
" locality {\n", " locality {\n",
" }\n", " }\n",
" incarnation: 9415777875944419380\n", " incarnation: 12790964875098705008\n",
" physical_device_desc: \"device: XLA_CPU device\"]" " physical_device_desc: \"device: XLA_CPU device\",\n",
" name: \"/device:GPU:0\"\n",
" device_type: \"GPU\"\n",
" memory_limit: 6940531098\n",
" locality {\n",
" bus_id: 1\n",
" links {\n",
" }\n",
" }\n",
" incarnation: 4940791198162309705\n",
" physical_device_desc: \"device: 0, name: GeForce RTX 2070 SUPER, pci bus id: 0000:01:00.0, compute capability: 7.5\",\n",
" name: \"/device:XLA_GPU:0\"\n",
" device_type: \"XLA_GPU\"\n",
" memory_limit: 17179869184\n",
" locality {\n",
" }\n",
" incarnation: 6996862811697216940\n",
" physical_device_desc: \"device: XLA_GPU device\"]"
] ]
}, },
"execution_count": 3, "execution_count": 7,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"import tensorflow as tf\n",
"from tensorflow.python.client import device_lib\n", "from tensorflow.python.client import device_lib\n",
"print(tf.test.is_gpu_available(cuda_only=True))\n",
"device_lib.list_local_devices()" "device_lib.list_local_devices()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"tensor([[0.8722, 0.5115, 0.9504],\n", "tensor([[0.8519, 0.7682, 0.3258],\n",
" [0.7723, 0.2860, 0.5793],\n", " [0.1957, 0.4073, 0.6085],\n",
" [0.5388, 0.5681, 0.4295],\n", " [0.9164, 0.8401, 0.4548],\n",
" [0.5269, 0.5165, 0.7475],\n", " [0.9011, 0.8838, 0.9559],\n",
" [0.4882, 0.8255, 0.6498]])" " [0.4692, 0.3993, 0.4313]])"
] ]
}, },
"execution_count": 4, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -156,7 +183,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -165,14 +192,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"590 ms ± 41.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" "362 ms ± 86.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
] ]
} }
], ],
@ -190,7 +217,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -199,14 +226,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"853 ms ± 16.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" "135 ms ± 3.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
] ]
} }
], ],
@ -225,23 +252,36 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"tensor([[0.1054, 0.3291, 0.7729, 0.6005, 0.2372],\n", "tensor([[0.2812, 0.3255, 0.5715, 0.1665, 0.6951],\n",
" [0.1022, 0.4534, 0.3964, 0.9174, 0.2610],\n", " [0.5562, 0.9592, 0.0911, 0.9672, 0.3311],\n",
" [0.3969, 0.5472, 0.3876, 0.1979, 0.4063],\n", " [0.6711, 0.0422, 0.5091, 0.6653, 0.9234],\n",
" [0.3630, 0.6374, 0.4176, 0.4804, 0.0396],\n", " [0.1029, 0.1447, 0.8385, 0.7580, 0.7998],\n",
" [0.8256, 0.2289, 0.2265, 0.4388, 0.6070]], device='cuda:0')\n", " [0.7787, 0.0114, 0.4865, 0.4171, 0.7066]], device='cuda:0')\n",
"tensor([[0.1054, 0.3291, 0.7729, 0.6005, 0.2372],\n", "tensor([[0.2812, 0.3255, 0.5715, 0.1665, 0.6951],\n",
" [0.1022, 0.4534, 0.3964, 0.9174, 0.2610],\n", " [0.5562, 0.9592, 0.0911, 0.9672, 0.3311],\n",
" [0.3969, 0.5472, 0.3876, 0.1979, 0.4063],\n", " [0.6711, 0.0422, 0.5091, 0.6653, 0.9234],\n",
" [0.3630, 0.6374, 0.4176, 0.4804, 0.0396],\n", " [0.1029, 0.1447, 0.8385, 0.7580, 0.7998],\n",
" [0.8256, 0.2289, 0.2265, 0.4388, 0.6070]], dtype=torch.float64)\n" " [0.7787, 0.0114, 0.4865, 0.4171, 0.7066]], dtype=torch.float64)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.7/site-packages/torch/cuda/__init__.py:134: UserWarning: \n",
" Found GPU0 GeForce RTX 2070 SUPER which requires CUDA_VERSION >= 10000 to\n",
" work properly, but your PyTorch was compiled\n",
" with CUDA_VERSION 9000. Please install the correct PyTorch binary\n",
" using instructions from https://pytorch.org\n",
" \n",
" warnings.warn(incorrect_binary_warn % (d, name, 10000, CUDA_VERSION))\n"
] ]
} }
], ],
@ -258,14 +298,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"11.3 ms ± 60.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" "12.8 ms ± 564 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
] ]
} }
], ],
@ -290,7 +330,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -304,18 +344,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"tensor([[0.3112, 0.7480, 0.1882, 0.8453, 0.8198],\n", "tensor([[0.6760, 0.8890, 0.7271, 0.4208, 0.1131],\n",
" [0.5953, 0.8401, 0.3126, 0.6025, 0.5252],\n", " [0.4036, 0.8012, 0.3448, 0.4120, 0.2439],\n",
" [0.1902, 0.5610, 0.7968, 0.1463, 0.7154],\n", " [0.6088, 0.4356, 0.9391, 0.1366, 0.4379],\n",
" [0.7979, 0.2161, 0.6176, 0.2951, 0.1980],\n", " [0.4540, 0.5981, 0.3885, 0.2473, 0.5938],\n",
" [0.6451, 0.3837, 0.5305, 0.2740, 0.3330]], device='cuda:0')\n" " [0.2976, 0.8384, 0.6107, 0.6882, 0.9593]], device='cuda:0')\n"
] ]
} }
], ],
@ -327,7 +367,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 17,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -339,18 +379,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 18,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"tensor([[ 1.2748e-03, 5.3656e-04, 1.7376e-04, 3.3888e-06, -1.7049e-04],\n", "tensor([[ 1.1191e-03, 1.6152e-04, -2.1592e-04, 1.4253e-04, -4.0365e-04],\n",
" [ 5.3656e-04, 6.3624e-04, 2.5957e-05, 3.3281e-04, -1.6239e-05],\n", " [ 1.6151e-04, 5.5901e-04, 2.6872e-04, -3.1842e-06, 2.8985e-04],\n",
" [ 1.7376e-04, 2.5957e-05, 7.6328e-04, 7.7603e-05, 1.8272e-04],\n", " [-2.1592e-04, 2.6872e-04, 1.0728e-03, -3.5968e-05, 5.5613e-04],\n",
" [ 3.3888e-06, 3.3281e-04, 7.7603e-05, 9.6281e-04, 1.2375e-04],\n", " [ 1.4253e-04, -3.1840e-06, -3.5968e-05, 6.5156e-04, -3.1820e-04],\n",
" [-1.7049e-04, -1.6239e-05, 1.8272e-04, 1.2375e-04, 6.9231e-04]],\n", " [-4.0365e-04, 2.8985e-04, 5.5613e-04, -3.1820e-04, 1.4067e-03]],\n",
" device='cuda:0')\n" " device='cuda:0')\n"
] ]
} }
@ -362,18 +402,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 19,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"tensor([[ 1.2748e-03, 5.3656e-04, 1.7376e-04, 3.3888e-06, -1.7049e-04],\n", "tensor([[ 1.1191e-03, 1.6152e-04, -2.1592e-04, 1.4253e-04, -4.0365e-04],\n",
" [ 5.3656e-04, 6.3624e-04, 2.5957e-05, 3.3281e-04, -1.6239e-05],\n", " [ 1.6151e-04, 5.5901e-04, 2.6872e-04, -3.1842e-06, 2.8985e-04],\n",
" [ 1.7376e-04, 2.5957e-05, 7.6328e-04, 7.7603e-05, 1.8272e-04],\n", " [-2.1592e-04, 2.6872e-04, 1.0728e-03, -3.5968e-05, 5.5613e-04],\n",
" [ 3.3888e-06, 3.3281e-04, 7.7603e-05, 9.6281e-04, 1.2375e-04],\n", " [ 1.4253e-04, -3.1840e-06, -3.5968e-05, 6.5156e-04, -3.1820e-04],\n",
" [-1.7049e-04, -1.6239e-05, 1.8272e-04, 1.2375e-04, 6.9231e-04]],\n", " [-4.0365e-04, 2.8985e-04, 5.5613e-04, -3.1820e-04, 1.4067e-03]],\n",
" dtype=torch.float64)\n" " dtype=torch.float64)\n"
] ]
} }
@ -409,7 +449,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.3" "version": "3.7.6"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -1,5 +0,0 @@
#!/usr/bin/env bash
# Remove the gpu-jupyter stack (deployed under the name "gpu" by
# add-to-swarm.sh) from the docker swarm.
# Resolve this script's own directory so it can be invoked from anywhere;
# quote the substitution (unquoted it word-splits on paths with spaces)
# and abort if the cd fails.
cd "$(cd -P -- "$(dirname -- "$0")" && pwd -P)" || exit 1
echo "Removing gpu-jupyter from docker swarm."
docker stack rm gpu