Actions
Bug #8288
closed — arv-mount / crunchstat in a crunch job fails to exit because reasons
Start date:
01/23/2016
Due date:
% Done:
100%
Estimated time:
(Total: 0.00 h)
Story points:
0.5
Description
The srun / arv-mount / crunchstat process chain does not exit: apparently, although the docker container has exited and `docker run` has returned, the docker daemon still holds an open file on the keep mount that `arv-mount` created.
# ps auxwww | awk '$1=="crunch"' crunch 8848 0.0 0.0 608616 31096 ? Sl Jan22 0:01 /usr/bin/python2.7 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep --exec crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=/data/crunch-tmp/crunch-job/z8ta6-ot0gb-v0p7tcarqimrnmj-0.cid -poll=10000 /usr/bin/docker run --name=z8ta6-ot0gb-v0p7tcarqimrnmj-0 --attach=stdout --attach=stderr --attach=stdin -i --cidfile=/data/crunch-tmp/crunch-job/z8ta6-ot0gb-v0p7tcarqimrnmj-0.cid --sig-proxy --memory=188032493k --memory-swap=190129641k --volume=/data/crunch-tmp/crunch-job/src:/data/crunch-tmp/crunch-job/src:ro --volume=/data/crunch-tmp/crunch-job/opt:/data/crunch-tmp/crunch-job/opt:ro --volume=/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep/by_pdh:/keep:ro --volume=/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep/tmp:/keep_tmp --volume=/tmp --env=TASK_SEQUENCE=0 --env=TASK_KEEPMOUNT=/keep --env=JOB_PARAMETER_INPUTS_COLLECTION=e8ccf6ac8348615a9873149f4ff07353+14726 --env=CRUNCH_SRC_COMMIT=15ee3e583ff9eeb273b52826c8944b43ae21e8bd --env=TASK_QSEQUENCE=0 --env=CRUNCH_INSTALL=/data/crunch-tmp/crunch-job/opt --env=CRUNCH_GIT_ARCHIVE_HASH=085e92f18ba93c6b8eb2507dbfbdf929 --env=CRUNCH_REFRESH_TRIGGER=/tmp/crunch_refresh_trigger --env=ARVADOS_API_TOKEN=2xb1o74nhxb7h2k2fculw4k6xahvss7v1ux8lhmd52conglznk --env=JOB_PARAMETER_SKIP_SQ_SN_REGEX=_decoy$ --env=CRUNCH_WORK=/data/crunch-tmp/crunch-job/work --env=CRUNCH_TMP=/data/crunch-tmp/crunch-job --env=TASK_TMPDIR=/tmp/crunch-job-task-work/humgen-03-02.1 --env=JOB_UUID=z8ta6-8i9sb-fsm22dece9hd043 --env=CRUNCH_JOB_UUID=z8ta6-8i9sb-fsm22dece9hd043 --env=TASK_SLOT_NUMBER=1 --env=CRUNCH_SRC_URL=/var/lib/arvados/internal.git --env=TASK_SLOT_NODE=humgen-03-02 --env=JOB_SCRIPT=bcftools-exp-gvcf-mpileup-cram-chunked.py --env=CRUNCH_NODE_SLOTS=1 
--env=JOB_PARAMETER_REFERENCE_COLLECTION=a83bd4e5a26a64612322f21515d93bab+6190 --env=JOB_PARAMETER_GENOME_CHUNKS=400 --env=CRUNCH_JOB_DOCKER_BIN=/usr/bin/docker --env=TASK_WORK=/tmp/crunch-job-task-work/humgen-03-02.1 --env=TASK_KEEPMOUNT_TMP=/keep_tmp --env=ARVADOS_API_HOST=api.arvados.sanger.ac.uk --env=JOB_WORK=/tmp/crunch-job-work --env=TASK_UUID=z8ta6-ot0gb-v0p7tcarqimrnmj --env=CRUNCH_SRC=/data/crunch-tmp/crunch-job/src --env=HOME=/tmp/crunch-job-task-work/humgen-03-02.1 bfdf5bfbb2858302a6b82242018f6d39aade1707e25541710ff85b804c35056c /bin/sh -c python -c "from pkg_resources import get_distribution as get; print \"Using Arvados SDK version\", get(\"arvados-python-client\").version">&2 2>/dev/null; mkdir -p "/tmp/crunch-job-work" "/tmp/crunch-job-task-work/humgen-03-02.1" && if which stdbuf >/dev/null ; then exec stdbuf --output=0 --error=0 \/data\/crunch\-tmp\/crunch\-job\/src\/crunch_scripts\/bcftools\-exp\-gvcf\-mpileup\-cram\-chunked\.py ; else exec \/data\/crunch\-tmp\/crunch\-job\/src\/crunch_scripts\/bcftools\-exp\-gvcf\-mpileup\-cram\-chunked\.py ; fi # pstree -Aap 8848 arv-mount,8848 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep--e |-{arv-mount},8860 `-{arv-mount},8863 # fuser -v -m /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep USER PID ACCESS COMMAND /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep: root kernel mount /data crunch 8848 ..c.. arv-mount root 37715 F.... docker # ps auxwww | grep 37715 | grep -v grep root 37715 0.1 0.0 12555452 94712 ? 
Ssl 2015 166:27 /usr/bin/docker -d --graph=/data/docker # docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES # docker version Client version: 1.7.1 Client API version: 1.19 Go version (client): go1.4.2 Git commit (client): 786b29d OS/Arch (client): linux/amd64 Server version: 1.7.1 Server API version: 1.19 Go version (server): go1.4.2 Git commit (server): 786b29d OS/Arch (server): linux/amd64 # lsof -p 37715 COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME docker 37715 root cwd DIR 8,6 4096 2 / docker 37715 root rtd DIR 8,6 4096 2 / docker 37715 root txt REG 8,6 16296881 1977972 /usr/bin/docker docker 37715 root 0u CHR 1,3 0t0 1029 /dev/null docker 37715 root 1u CHR 136,2 0t0 5 /dev/pts/2 docker 37715 root 2u CHR 136,2 0t0 5 /dev/pts/2 docker 37715 root 3u unix 0xffff881881334280 0t0 48220470 /var/run/docker.sock docker 37715 root 4u unix 0xffff882ff7a0f700 0t0 1501908 /var/run/docker.sock docker 37715 root 5r CHR 1,9 0t0 1034 /dev/urandom docker 37715 root 6u 0000 0,9 0 6847 anon_inode docker 37715 root 9u unix 0xffff881ff0922300 0t0 48310485 /var/run/docker.sock docker 37715 root 11u unix 0xffff881ff0927a80 0t0 48326780 /var/run/docker.sock docker 37715 root 12u unix 0xffff88191467b800 0t0 48698892 /var/run/docker.sock docker 37715 root 14u unix 0xffff880106038000 0t0 48603069 /var/run/docker.sock docker 37715 root 16u unix 0xffff882005fa6c80 0t0 48334610 /var/run/docker.sock docker 37715 root 17u REG 252,0 1996800 366018572 /data/docker/linkgraph.db docker 37715 root 19u REG 252,0 1996800 366018572 /data/docker/linkgraph.db docker 37715 root 23u unix 0xffff881ff0922d80 0t0 48247780 /var/run/docker.sock docker 37715 root 24u unix 0xffff88191467e580 0t0 48663201 /var/run/docker.sock docker 37715 root 25u unix 0xffff88010603c980 0t0 48683402 /var/run/docker.sock docker 37715 root 26u unix 0xffff881ef218b100 0t0 48310488 /var/run/docker.sock docker 37715 root 27u unix 0xffff882005fa6580 0t0 48309653 /var/run/docker.sock # stat 
/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep File: `/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep' Size: 4096 Blocks: 8 IO Block: 4096 directory Device: fc00h/64512d Inode: 262078526 Links: 2 Access: (0755/drwxr-xr-x) Uid: (15324/ crunch) Gid: ( 1593/ arvados) Access: 2016-01-22 21:49:38.729986521 +0000 Modify: 2016-01-22 21:49:38.441989497 +0000 Change: 2016-01-22 21:49:38.441989497 +0000 Birth: - # stat -f /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep File: "/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep" ID: 6fef924393f3e2f8 Namelen: 255 Type: ext2/ext3 Block size: 4096 Fundamental block size: 4096 Blocks: Total: 17485339344 Free: 16349728046 Available: 16174171418 Inodes: Total: 1097203712 Free: 1096592099 # uname -a Linux humgen-03-02 3.13.0-63-generic #104~precise1-Ubuntu SMP Tue Aug 18 17:03:03 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux # readlink /proc/37715/fd/6 anon_inode:[eventpoll] # fusermount -u /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep fusermount: failed to unmount /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep: Invalid argument # mount -t fuse # umount /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep umount: /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep: not mounted # rm -rf /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep # pstree -Aasp 8848 init,1 `-slurmstepd,8845 `-arv-mount,8848 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other ... |-{arv-mount},8860 `-{arv-mount},8863 # strace -f -p 8848 Process 8848 attached with 3 threads - interrupt to quit [pid 8863] futex(0x7fcc728c7a84, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...> [pid 8860] select(0, NULL, NULL, NULL, {1, 754286} <unfinished ...> [pid 8848] futex(0x32c6e80, FUTEX_WAIT_PRIVATE, 0, NULL <unfinished ...> [pid 8860] <... 
select resumed> ) = 0 (Timeout) [pid 8860] write(2, "crunchstat: keepcalls 0 put 0 ge"..., 74) = 74 [pid 8860] write(2, "crunchstat: net:keep0 0 tx 0 rx "..., 70) = 70 [pid 8860] write(2, "crunchstat: keepcache 0 hit 0 mi"..., 76) = 76 [pid 8860] write(2, "crunchstat: fuseops 0 write 0 re"..., 78) = 78 [pid 8860] write(2, "crunchstat: blkio:0:0 0 write 0 "..., 80) = 80 [pid 8860] select(0, NULL, NULL, NULL, {10, 0}) = 0 (Timeout) [pid 8860] write(2, "crunchstat: keepcalls 0 put 0 ge"..., 74) = 74 [pid 8860] write(2, "crunchstat: net:keep0 0 tx 0 rx "..., 70) = 70 [pid 8860] write(2, "crunchstat: keepcache 0 hit 0 mi"..., 76) = 76 [pid 8860] write(2, "crunchstat: fuseops 0 write 0 re"..., 78) = 78 [pid 8860] write(2, "crunchstat: blkio:0:0 0 write 0 "..., 80) = 80 [pid 8860] select(0, NULL, NULL, NULL, {10, 0}^C <unfinished ...> Process 8848 detached Process 8860 detached Process 8863 detached # kill 8860
Actions