...
Code Block | ||
---|---|---|
| ||
# After DAOS servers and DAOS admin and client RPMs loaded $ dmg storage format Format Summary: Hosts SCM Devices NVMe Devices ----- ----------- ------------ boro-[8,35,52-53] 1 0 $ dmg pool list Pool UUID Svc Replicas --------- ------------ 733bee7b-c2af-499e-99dd-313b1ef092a9 [1-3] $ daos cont create --pool=$DAOS_POOL --type=POSIX --oclass=RP_3G1 --properties=rf:2 Successfully created container 2649aa0f-3ad7-4943-abf5-4343205a637b $ daos pool list-cont --pool=$DAOS_POOL 2649aa0f-3ad7-4943-abf5-4343205a637b $ dmg pool query --pool=$DAOS_POOL Pool 733bee7b-c2af-499e-99dd-313b1ef092a9, ntarget=32, disabled=0, leader=2, version=1 Pool space info: - Target(VOS) count:32 - SCM: Total size: 5.0 GB Free: 5.0 GB, min:156 MB, max:156 MB, mean:156 MB - NVMe: Total size: 0 B Free: 0 B, min:0 B, max:0 B, mean:0 B Rebuild idle, 0 objs, 0 recs $ df -h -t fuse.daos df: no file systems processed $ mkdir /tmp/daos_test1 $ dfuse --mountpoint=/tmp/daos_test1 --pool=$DAOS_POOL --cont=$DAOS_CONT $ df -h -t fuse.daos Filesystem Size Used Avail Use% Mounted on dfuse 19G 1.1M 19G 1% /tmp/daos_test1 $ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --directory=/tmp/daos_test1 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting random-write: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=pvsync, iodepth=16 ... 
fio-3.7 Starting 8 processes random-write: Laying out IO files (4 files / total 128MiB) random-write: Laying out IO files (4 files / total 128MiB) random-write: Laying out IO files (4 files / total 128MiB) random-write: Laying out IO files (4 files / total 128MiB) random-write: Laying out IO files (4 files / total 128MiB) random-write: Laying out IO files (4 files / total 128MiB) random-write: Laying out IO files (4 files / total 128MiB) random-write: Laying out IO files (4 files / total 128MiB) Jobs: 8 (f=32): [w(8)][100.0%][r=0KiB/s,w=96.1MiB/s][r=0,w=24.6k IOPS][eta 00m:00s] random-write: (groupid=0, jobs=8): err= 0: pid=27879: Sat Apr 17 01:12:57 2021 write: IOPS=24.4k, BW=95.3MiB/s (99.9MB/s)(5716MiB/60001msec) clat (usec): min=220, max=6687, avg=326.19, stdev=55.29 lat (usec): min=220, max=6687, avg=326.28, stdev=55.29 clat percentiles (usec): | 1.00th=[ 260], 5.00th=[ 273], 10.00th=[ 285], 20.00th=[ 293], | 30.00th=[ 306], 40.00th=[ 314], 50.00th=[ 322], 60.00th=[ 330], | 70.00th=[ 338], 80.00th=[ 355], 90.00th=[ 375], 95.00th=[ 396], | 99.00th=[ 445], 99.50th=[ 465], 99.90th=[ 523], 99.95th=[ 562], | 99.99th=[ 1827] bw ( KiB/s): min=10976, max=12496, per=12.50%, avg=12191.82, stdev=157.87, samples=952 iops : min= 2744, max= 3124, avg=3047.92, stdev=39.47, samples=952 lat (usec) : 250=0.23%, 500=99.61%, 750=0.15%, 1000=0.01% lat (msec) : 2=0.01%, 4=0.01%, 10=0.01% cpu : usr=0.81%, sys=1.69%, ctx=1463535, majf=0, minf=308 IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=0,1463226,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=16 Run status group 0 (all jobs): WRITE: bw=95.3MiB/s (99.9MB/s), 95.3MiB/s-95.3MiB/s (99.9MB/s-99.9MB/s), io=5716MiB (5993MB), run=60001-60001msec |
...
Run dfuse with
...
rebuild
Code Block | ||
---|---|---|
| ||
# Start fio on the dfuse mount point $ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --directory=/tmp/daos_test1 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting |
...
No Format |
---|
random-write: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=pvsync, iodepth=16 ... fio-3.7 Starting 8 processes fio: io_u error on file /tmp/daos_test1/random-write.2.1: Input/output error: write offset=8527872, buflen=4096 fio: pid=28242, err=5 file:io_u.c:1747 bw ( KiB/s): min= 3272, max=12384, per=30.14%, avg=11624.50, stdev=2181.01, samples=128 iops : min= 818, max= 3096, avg=2906.12, stdev=545.25, samples=128 lat (usec) : 250=0.23%, 500=99.59%, 750=0.12%, 1000=0.01% lat (msec) : 2=0.03%, 4=0.02% cpu : usr=0.27%, sys=0.66%, ctx=186210, majf=0, minf=494 IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.1%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=0,186000,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=16 Run status group 0 (all jobs): WRITE: bw=37.7MiB/s (39.5MB/s), 37.7MiB/s-37.7MiB/s (39.5MB/s-39.5MB/s), io=727MiB (762MB), run=19291-19291msec ... |
Code Block | ||
---|---|---|
|
...
# from daos_admin console, stop leader-rank with debug $ dmg -d system stop --ranks=3 |
...
DEBUG 01:34:58.026753 main.go:217: debug output enabled |
...
DEBUG 01:34:58.027457 main.go:244: control config loaded from /etc/daos/daos_control.yml |
...
Rank Operation Result |
...
--------- ------ |
...
3 stop OK |
...
$ daos pool list-cont --pool=$DAOS_POOL |
...
No Format |
---|
cf2a95ce-9910-4d5e-814c-cafb0a7f0944 $ dmg pool query --pool=$DAOS_POOL Pool 70f73efc-848e-4f6e-b4fd-909bcf9bd427, ntarget=32, disabled=8, leader=2, version=18 Pool space info: - Target(VOS) count:24 - SCM: Total size: 15 GB Free: 14 GB, min:575 MB, max:597 MB, mean:587 MB - NVMe: Total size: 0 B Free: 0 B, min:0 B, max:0 B, mean:0 B Rebuild done, 1 objs, 57 recs |
$ dmg pool list
Pool UUID Svc Replicas
...
# Verify that the stopped server has been evicted $ dmg system query -v Rank UUID Control Address Fault Domain State Reason ---- --------------- ----- |
...
70f73efc-848e-4f6e-b4fd-909bcf9bd427 [1-2]
$ daos pool list-cont --pool=$DAOS_POOL
cf2a95ce-9910-4d5e-814c-cafb0a7f0944
$ dmg pool query --pool=$DAOS_POOL
Pool 70f73efc-848e-4f6e-b4fd-909bcf9bd427,
ntarget=32,
disabled=8,
leader=2,
version=18
Pool space info:
Target(VOS) count:24
SCM:
Total size: 15 GB
Free: 14 GB, min:575 MB, max:597 MB, mean:587 MB
NVMe:
Total size: 0 B
Free: 0 B, min:0 B, max:0 B, mean:0 B
Rebuild done, 1 objs, 57 recs
...
------- ----- ------ 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 boro-8.boro.hpdd.intel.com Joined 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1. |
...
35:10001 boro-35.boro.hpdd.intel.com Joined |
...
2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 boro-53.boro.hpdd.intel.com Joined |
...
3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 boro-52.boro.hpdd.intel.com Evicted system stop |
Code Block | ||
---|---|---|
| ||
# After the evicted server has been restarted, verify that it has rejoined $ /usr/bin/dmg system query -v |
...
Rank UUID Control Address Fault Domain State Reason |
...
---- |
...
------- |
...
$ df -h -t fuse.daos
Filesystem Size Used Avail Use% Mounted on
dfuse 14G 867M 14G 7% /tmp/daos_test1
$ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --directory=/tmp/daos_test1 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting
random-write: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=pvsync, iodepth=16
...
fio-3.7
Starting 8 processes file:filesetup.c:349, func=fstat, error=Input/output error
Run status group 0 (all jobs):
...
- |
...
$ dmg pool list
Pool UUID Svc Replicas
------- ----- |
...
- |
...
- |
...
- |
...
- |
...
- |
...
- |
...
- - |
...
- |
...
- |
...
- |
...
- |
...
- |
...
-- |
...
-- |
...
$ dmg -o /etc/daos/daos_control.yml -d system stop --ranks=2
No Format |
---|
DEBUG 02:01:54.916742 main.go:217: debug output enabled DEBUG 02:01:54.917508 main.go:244: control config loaded from /etc/daos/daos_control.yml DEBUG 02:01:54.920913 system.go:568: DAOS system stop request: &{unaryRequest:{request:{deadline:{wall:0 ext:0 loc:<nil>} Sys: HostList:[]} rpc:0xd1cf60} msRequest:{} sysRequest:{Ranks:{RWMutex:{w:{state:0 sema:0} writerSem:0 readerSem:0 readerCount:0 readerWait:0} HostSet:{Mutex:{state:0 sema:0} list:0xc00029d340}} Hosts:{Mutex:{state:0 sema:0} list:0xc00029d300}} Prep:true Kill:true Force:false} DEBUG 02:01:54.921844 rpc.go:196: request hosts: [boro-8:10001 boro-35:10001] Rank Operation Result - 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 /boro-8.boro.hpdd.intel.com Joined 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 /boro-35.boro.hpdd.intel.com Joined 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 /boro-53.boro.hpdd.intel.com Joined 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 /boro-52.boro.hpdd.intel.com Joined # Unmount after test completed $ fusermount -u /tmp/daos_test1/ $ df -h -t fuse.daos df: no file systems processed |
Run mpirun mdtest with rebuild
Code Block | ||
---|---|---|
| ||
$ dmg pool create --size=50G Creating DAOS pool with automatic storage allocation: 50 GB NVMe + 6.00% SCM Pool created with 100.00% SCM/NVMe ratio ----------------------------------------- UUID : 4eda8a8c-028c-461c-afd3---- 2 stop OK |
$ dmg pool create --size=50G
No Format |
---|
Creating DAOS pool with automatic storage allocation: 50 GB NVMe + 6.00% SCM Pool created with 100.00% SCM/NVMe ratio ----------------------------------------- UUID : 4eda8a8c-028c-461c-afd3-704534961572 Service Ranks : [1-3] Storage Ranks : [0-3] Total Size : 50 GB SCM : 50 GB (12 GB / rank) NVMe : 0 B (0 B / rank) |
$ daos cont create --pool=$DAOS_POOL --type=POSIX --oclass=RP_3G1 --properties=rf:2
Successfully created container d71ff6a5-15a5-43fe-b829-bef9c65b9ccb
Run mpirun mdtest with rebuild
$ /usr/lib64/mpich/bin/mpirun -host boro-8 -np 30 mdtest -a DFS -z 0 -F -C -i 100 -n 1667 -e 4096 -d / -w 4096 --dfs.chunk_size 1048576 --dfs.cont $DAOS_CONT --dfs.destroy --dfs.dir_oclass RP_3G1 --dfs.group daos_server --dfs.oclass RP_3G1 --dfs.pool $DAOS_POOL
-- started at 04/22/2021 17:46:20 --
mdtest-3.4.0+dev was launched with 30 total task(s) on 1 node(s)
Command line used: mdtest '-a' 'DFS' '-z' '0' '-F' '-C' '-i' '100' '-n' '1667' '-e' '4096' '-d' '/' '-w' '4096' '--dfs.chunk_size' '1048576' '--dfs.cont' 'd71ff6a5-15a5-43fe-b829-bef9c65b9ccb' '--dfs.destroy' '--dfs.dir_oclass' 'RP_3G1' '--dfs.group' 'daos_server' '--dfs.oclass' 'RP_3G1' '--dfs.pool' '4eda8a8c-028c-461c-afd3-704534961572'
WARNING: unable to use realpath() on file system.
Path:
FS: 0.0 GiB Used FS: -nan% Inodes: 0.0 Mi Used Inodes: -nan%
Nodemap: 111111111111111111111111111111
30 tasks, 50010 files
$ dmg system stop --ranks=3
...
: 0 B (0 B / rank)
$ daos cont create --pool=$DAOS_POOL --type=POSIX --oclass=RP_3G1 --properties=rf:2
Successfully created container d71ff6a5-15a5-43fe-b829-bef9c65b9ccb
$ /usr/lib64/mpich/bin/mpirun -host boro-8 -np 30 mdtest -a DFS -z 0 -F -C -i 100 -n 1667 -e 4096 -d / -w 4096 --dfs.chunk_size 1048576 --dfs.cont $DAOS_CONT --dfs.destroy --dfs.dir_oclass RP_3G1 --dfs.group daos_server --dfs.oclass RP_3G1 --dfs.pool $DAOS_POOL
-- started at 04/22/2021 17:46:20 --
mdtest-3.4.0+dev was launched with 30 total task(s) on 1 node(s)
Command line used: mdtest '-a' 'DFS' '-z' '0' '-F' '-C' '-i' '100' '-n' '1667' '-e' '4096' '-d' '/' '-w' '4096' '--dfs.chunk_size' '1048576' '--dfs.cont' 'd71ff6a5-15a5-43fe-b829-bef9c65b9ccb' '--dfs.destroy' '--dfs.dir_oclass' 'RP_3G1' '--dfs.group' 'daos_server' '--dfs.oclass' 'RP_3G1' '--dfs.pool' '4eda8a8c-028c-461c-afd3-704534961572'
WARNING: unable to use realpath() on file system.
Path:
FS: 0.0 GiB Used FS: -nan% Inodes: 0.0 Mi Used Inodes: -nan%
Nodemap: 111111111111111111111111111111
30 tasks, 50010 files
... |
Code Block | ||
---|---|---|
| ||
# from daos_admin console, stop a server rank
$ dmg system stop --ranks=2
Rank Operation Result
--------- ------
2 stop OK
# Verify that the stopped server has been evicted
$ dmg system query -v
Rank UUID Control Address Fault Domain State Reason
---- --------------- ------------ ----- ------
0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 boro-8.boro.hpdd.intel.com Joined
1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 boro-35.boro.hpdd.intel.com Joined
2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 boro-53.boro.hpdd.intel.com Evicted system stop
3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 boro-52.boro.hpdd.intel.com Joined |
No Format |
---|
# After the evicted server has been restarted, verify that it has rejoined
$ /usr/bin/dmg system query -v
Rank UUID Control Address Fault Domain State Reason
---- --------------- ------------ ----- ------
0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 /boro-8.boro.hpdd.intel.com Joined
1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 /boro-35.boro.hpdd.intel.com Joined
2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 /boro-53.boro.hpdd.intel.com Joined
3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 /boro-52.boro.hpdd.intel.com Joined |
Clean-Up
No Format |
---|
# pool reintegrate $ dmg pool reintegrate --pool=$DAOS_POOL --rank=2 Reintegration command succeeded # destroy container $ daos container destroy --pool=$DAOS_POOL --cont=$DAOS_CONT # destroy pool $ dmg pool destroy --pool=$DAOS_POOL Pool-destroy command succeeded # stop clients $ pdsh -S -w $CLIENT_NODES "sudo systemctl stop daos_agent.service" # disable clients $ pdsh -S -w $CLIENT_NODES "sudo systemctl disable daos_agent.service" # stop servers $ pdsh -S -w $SERVER_NODES "sudo systemctl stop daos_server.service" # disable servers $ pdsh -S -w $SERVER_NODES "sudo systemctl disable daos_server.service" |
...