Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
languagebash
# After the DAOS server, admin, and client RPMs have been loaded

$ dmg storage format
Format Summary:
  Hosts             SCM Devices NVMe Devices 
  -----             ----------- ------------ 
  boro-[8,35,52-53] 1           0            

$ dmg pool list
Pool UUID Svc Replicas 
--------- ------------ 
733bee7b-c2af-499e-99dd-313b1ef092a9 [1-3]

$ daos cont create --pool=$DAOS_POOL --type=POSIX --oclass=RP_3G1 --properties=rf:2
Successfully created container 2649aa0f-3ad7-4943-abf5-4343205a637b 

$ daos pool list-cont --pool=$DAOS_POOL
2649aa0f-3ad7-4943-abf5-4343205a637b

$ dmg pool query --pool=$DAOS_POOL 
Pool 733bee7b-c2af-499e-99dd-313b1ef092a9, ntarget=32, disabled=0, leader=2, version=1 
Pool space info: 
- Target(VOS) count:32 
- SCM: 
  Total size: 5.0 GB 
  Free: 5.0 GB, min:156 MB, max:156 MB, mean:156 MB 
- NVMe: 
  Total size: 0 B 
  Free: 0 B, min:0 B, max:0 B, mean:0 B 
Rebuild idle, 0 objs, 0 recs

$ df -h -t fuse.daos
df: no file systems processed

$ mkdir /tmp/daos_test1

$ dfuse --mountpoint=/tmp/daos_test1 --pool=$DAOS_POOL --cont=$DAOS_CONT

$ df -h -t fuse.daos
Filesystem      Size  Used Avail Use% Mounted on
dfuse            19G  1.1M   19G   1% /tmp/daos_test1

$ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --directory=/tmp/daos_test1 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting
random-write: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=pvsync, iodepth=16
...
fio-3.7
Starting 8 processes
random-write: Laying out IO files (4 files / total 128MiB)
random-write: Laying out IO files (4 files / total 128MiB)
random-write: Laying out IO files (4 files / total 128MiB)
random-write: Laying out IO files (4 files / total 128MiB)
random-write: Laying out IO files (4 files / total 128MiB)
random-write: Laying out IO files (4 files / total 128MiB)
random-write: Laying out IO files (4 files / total 128MiB)
random-write: Laying out IO files (4 files / total 128MiB)
Jobs: 8 (f=32): [w(8)][100.0%][r=0KiB/s,w=96.1MiB/s][r=0,w=24.6k IOPS][eta 00m:00s]
random-write: (groupid=0, jobs=8): err= 0: pid=27879: Sat Apr 17 01:12:57 2021
  write: IOPS=24.4k, BW=95.3MiB/s (99.9MB/s)(5716MiB/60001msec)
    clat (usec): min=220, max=6687, avg=326.19, stdev=55.29
     lat (usec): min=220, max=6687, avg=326.28, stdev=55.29
    clat percentiles (usec):
     |  1.00th=[  260],  5.00th=[  273], 10.00th=[  285], 20.00th=[  293],
     | 30.00th=[  306], 40.00th=[  314], 50.00th=[  322], 60.00th=[  330],
     | 70.00th=[  338], 80.00th=[  355], 90.00th=[  375], 95.00th=[  396],
     | 99.00th=[  445], 99.50th=[  465], 99.90th=[  523], 99.95th=[  562],
     | 99.99th=[ 1827]
   bw (  KiB/s): min=10976, max=12496, per=12.50%, avg=12191.82, stdev=157.87, samples=952
   iops        : min= 2744, max= 3124, avg=3047.92, stdev=39.47, samples=952
  lat (usec)   : 250=0.23%, 500=99.61%, 750=0.15%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=0.01%, 10=0.01%
  cpu          : usr=0.81%, sys=1.69%, ctx=1463535, majf=0, minf=308
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=0,1463226,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=16

Run status group 0 (all jobs):
  WRITE: bw=95.3MiB/s (99.9MB/s), 95.3MiB/s-95.3MiB/s (99.9MB/s-99.9MB/s), io=5716MiB (5993MB), run=60001-60001msec

...

Run dfuse with

...

rebuild

Code Block
languagebash
# Start dfuse
$ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --directory=/tmp/daos_test1 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting

...

No Format


random-write: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=pvsync, iodepth=16
...
fio-3.7
Starting 8 processes
fio: io_u error on file /tmp/daos_test1/random-write.2.1: Input/output error: write offset=8527872, buflen=4096
fio: pid=28242, err=5

file:io_u.c:1747
bw ( KiB/s): min= 3272, max=12384, per=30.14%, avg=11624.50, stdev=2181.01, samples=128
iops : min= 818, max= 3096, avg=2906.12, stdev=545.25, samples=128
lat (usec) : 250=0.23%, 500=99.59%, 750=0.12%, 1000=0.01%
lat (msec) : 2=0.03%, 4=0.02%
cpu : usr=0.27%, sys=0.66%, ctx=186210, majf=0, minf=494
IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.1%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=0,186000,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=16
Run status group 0 (all jobs):
WRITE: bw=37.7MiB/s (39.5MB/s), 37.7MiB/s-37.7MiB/s (39.5MB/s-39.5MB/s), io=727MiB (762MB), run=19291-19291msec
...

Code Block
languagebash

...

# from daos_admin console, stop leader-rank with debug
$ dmg -d system stop --ranks=3

...


DEBUG 01:34:58.026753 main.go:217: debug output enabled

...


DEBUG 01:34:58.027457 main.go:244: control config loaded from /etc/daos/daos_control.yml

...


Rank Operation Result

...


--------- ------

...


3 stop OK

...



$ daos pool list-cont --pool=$DAOS_POOL

...

No Format

cf2a95ce-9910-4d5e-814c-cafb0a7f0944

$ dmg pool query --pool=$DAOS_POOL
Pool 70f73efc-848e-4f6e-b4fd-909bcf9bd427, ntarget=32, disabled=8, leader=2, version=18
Pool space info:
- Target(VOS) count:24
- SCM:
  Total size: 15 GB
  Free: 14 GB, min:575 MB, max:597 MB, mean:587 MB
- NVMe:
  Total size: 0 B
  Free: 0 B, min:0 B, max:0 B, mean:0 B
Rebuild done, 1 objs, 57 recs

$ dmg pool list
Pool UUID Svc Replicas

...



# Verify that the stopped server has been evicted
$ dmg system query -v
Rank UUID Control Address Fault Domain State Reason
---- --------------- -----

...

70f73efc-848e-4f6e-b4fd-909bcf9bd427 [1-2]

$ daos pool list-cont --pool=$DAOS_POOL

cf2a95ce-9910-4d5e-814c-cafb0a7f0944

$ dmg pool query --pool=$DAOS_POOL

Pool 70f73efc-848e-4f6e-b4fd-909bcf9bd427,

ntarget=32,

disabled=8,

leader=2,

version=18

Pool space info:

Target(VOS) count:24

SCM:
Total size: 15 GB
Free: 14 GB, min:575 MB, max:597 MB, mean:587 MB

NVMe:
Total size: 0 B
Free: 0 B, min:0 B, max:0 B, mean:0 B
Rebuild done, 1 objs, 57 recs

...

------- ----- ------
0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 boro-8.boro.hpdd.intel.com Joined
1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 boro-35.boro.hpdd.intel.com Joined

...


2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 boro-53.boro.hpdd.intel.com Joined

...


3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 boro-52.boro.hpdd.intel.com Evicted system stop
Code Block
languagebash
# After the evicted server has been restarted, verify that it has rejoined
$ /usr/bin/dmg system query -v

...


Rank UUID Control Address Fault Domain State Reason

...

 
----

...

 -------

...

$ df -h -t fuse.daos
Filesystem Size Used Avail Use% Mounted on
dfuse 14G 867M 14G 7% /tmp/daos_test1

$ fio --name=random-write --ioengine=pvsync --rw=randwrite --bs=4k --size=128M --nrfiles=4 --directory=/tmp/daos_test1 --numjobs=8 --iodepth=16 --runtime=60 --time_based --direct=1 --buffered=0 --randrepeat=0 --norandommap --refill_buffers --group_reporting
random-write: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=pvsync, iodepth=16
...
fio-3.7
Starting 8 processes file:filesetup.c:349, func=fstat, error=Input/output error
Run status group 0 (all jobs):

...

-

...

$ dmg pool list
Pool UUID Svc Replicas

------- -----

...

-

...

-

...

-

...

-

...

-

...

-

...

- -

...

-

...

-

...

-

...

-

...

 -

...

--

...

--

...

$ dmg -o /etc/daos/daos_control.yml -d system stop --ranks=2

No Format
DEBUG 02:01:54.916742 main.go:217: debug output enabled
DEBUG 02:01:54.917508 main.go:244: control config loaded from /etc/daos/daos_control.yml
DEBUG 02:01:54.920913 system.go:568: DAOS system stop request: &{unaryRequest:{request:{deadline:{wall:0 ext:0 loc:<nil>} Sys: HostList:[]} rpc:0xd1cf60} msRequest:{} sysRequest:{Ranks:{RWMutex:{w:{state:0 sema:0} writerSem:0 readerSem:0 readerCount:0 readerWait:0} HostSet:{Mutex:{state:0 sema:0} list:0xc00029d340}} Hosts:{Mutex:{state:0 sema:0} list:0xc00029d300}} Prep:true Kill:true Force:false}
DEBUG 02:01:54.921844 rpc.go:196: request hosts: [boro-8:10001 boro-35:10001] 
Rank Operation Result
- 
0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 /boro-8.boro.hpdd.intel.com Joined 
1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 /boro-35.boro.hpdd.intel.com Joined 
2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 /boro-53.boro.hpdd.intel.com Joined 
3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 /boro-52.boro.hpdd.intel.com Joined

# Unmount after test completed
$ fusermount -u /tmp/daos_test1/ 
$ df -h -t fuse.daos 
df: no file systems processed


Run mpirun mdtest with rebuild

Code Block
languagebash
$ dmg pool create --size=50G
Creating DAOS pool with automatic storage allocation: 50 GB NVMe + 6.00% SCM 
Pool created with 100.00% SCM/NVMe ratio 
-----------------------------------------
 UUID : 4eda8a8c-028c-461c-afd3----
2 stop OK

$ dmg pool create --size=50G

No Format
Creating DAOS pool with automatic storage allocation: 50 GB NVMe + 6.00% SCM
Pool created with 100.00% SCM/NVMe ratio
-----------------------------------------
  UUID          : 4eda8a8c-028c-461c-afd3-704534961572
  Service Ranks : [1-3]
  Storage Ranks : [0-3]
  Total Size    : 50 GB
  SCM           : 50 GB (12 GB / rank)
  NVMe          : 0 B (0 B / rank)

$ daos cont create --pool=$DAOS_POOL --type=POSIX --oclass=RP_3G1 --properties=rf:2

Successfully created container d71ff6a5-15a5-43fe-b829-bef9c65b9ccb

Run mpirun mdtest with rebuild

$ /usr/lib64/mpich/bin/mpirun -host boro-8 -np 30 mdtest -a DFS -z 0 -F -C -i 100 -n 1667 -e 4096 -d / -w 4096 --dfs.chunk_size 1048576 --dfs.cont $DAOS_CONT --dfs.destroy --dfs.dir_oclass RP_3G1 --dfs.group daos_server --dfs.oclass RP_3G1 --dfs.pool $DAOS_POOL

-- started at 04/22/2021 17:46:20 --
mdtest-3.4.0+dev was launched with 30 total task(s) on 1 node(s)
Command line used: mdtest '-a' 'DFS' '-z' '0' '-F' '-C' '-i' '100' '-n' '1667' '-e' '4096' '-d' '/' '-w' '4096' '--dfs.chunk_size' '1048576' '--dfs.cont' 'd71ff6a5-15a5-43fe-b829-bef9c65b9ccb' '--dfs.destroy' '--dfs.dir_oclass' 'RP_3G1' '--dfs.group' 'daos_server' '--dfs.oclass' 'RP_3G1' '--dfs.pool' '4eda8a8c-028c-461c-afd3-704534961572'
WARNING: unable to use realpath() on file system.
Path:
FS: 0.0 GiB Used FS: -nan% Inodes: 0.0 Mi Used Inodes: -nan%
Nodemap: 111111111111111111111111111111
30 tasks, 50010 files

$ dmg system stop --ranks=3

...

 : 0 B (0 B / rank)

$ daos cont create --pool=$DAOS_POOL --type=POSIX --oclass=RP_3G1 --properties=rf:2
Successfully created container d71ff6a5-15a5-43fe-b829-bef9c65b9ccb

$ /usr/lib64/mpich/bin/mpirun -host boro-8 -np 30 mdtest -a DFS -z 0 -F -C -i 100 -n 1667 -e 4096 -d / -w 4096 --dfs.chunk_size 1048576 --dfs.cont $DAOS_CONT --dfs.destroy --dfs.dir_oclass RP_3G1 --dfs.group daos_server --dfs.oclass RP_3G1 --dfs.pool $DAOS_POOL

-- started at 04/22/2021 17:46:20 --
mdtest-3.4.0+dev was launched with 30 total task(s) on 1 node(s)
Command line used: mdtest '-a' 'DFS' '-z' '0' '-F' '-C' '-i' '100' '-n' '1667' '-e' '4096' '-d' '/' '-w' '4096' '--dfs.chunk_size' '1048576' '--dfs.cont' 'd71ff6a5-15a5-43fe-b829-bef9c65b9ccb' '--dfs.destroy' '--dfs.dir_oclass' 'RP_3G1' '--dfs.group' 'daos_server' '--dfs.oclass' 'RP_3G1' '--dfs.pool' '4eda8a8c-028c-461c-afd3-704534961572'
WARNING: unable to use realpath() on file system.
Path:
FS: 0.0 GiB Used FS: -nan% Inodes: 0.0 Mi Used Inodes: -nan%
Nodemap: 111111111111111111111111111111
30 tasks, 50010 files
...
Code Block
languagebash
# from daos_admin console, stop a server rank
$ dmg system stop --ranks=2
Rank Operation Result
--------- ------
2 stop OK

# Verify that the stopped server has been evicted
$ dmg system query -v 
Rank UUID Control Address Fault Domain State Reason
 ---- --------------- ------------ ----- ------
 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 boro-8.boro.hpdd.intel.com Joined
 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 boro-35.boro.hpdd.intel.com Joined
 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 boro-53.boro.hpdd.intel.com Evicted system stop
 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 boro-52.boro.hpdd.intel.com Joined
No Format
# After the evicted server has been restarted, verify that it has rejoined
$ /usr/bin/dmg system query -v
 Rank UUID Control Address Fault Domain State Reason
 ---- --------------- ------------ ----- ------
 0 2bf0e083-33d6-4ce3-83c4-c898c2a7ddbd 10.7.1.8:10001 /boro-8.boro.hpdd.intel.com Joined
 1 c9ac1dd9-0f9d-4684-90d3-038b720fd26b 10.7.1.35:10001 /boro-35.boro.hpdd.intel.com Joined
 2 80e44fe9-3a2b-4808-9a0f-88c3cbe7f565 10.7.1.53:10001 /boro-53.boro.hpdd.intel.com Joined
 3 a26fd44a-6089-4cc3-a06b-278a85607fd3 10.7.1.52:10001 /boro-52.boro.hpdd.intel.com Joined


Clean-Up

No Format
# pool reintegrate
$ dmg pool reintegrate --pool=$DAOS_POOL --rank=2
Reintegration command succeeded

# destroy container
$ daos container destroy --pool=$DAOS_POOL --cont=$DAOS_CONT

# destroy pool
$ dmg pool destroy --pool=$DAOS_POOL
Pool-destroy command succeeded

# stop clients
$ pdsh -S -w $CLIENT_NODES "sudo systemctl stop daos_agent.service"

# disable clients
$ pdsh -S -w $CLIENT_NODES "sudo systemctl disable daos_agent.service"

# stop servers
$ pdsh -S -w $SERVER_NODES "sudo systemctl stop daos_server.service"

# disable servers
$ pdsh -S -w $SERVER_NODES "sudo systemctl disable daos_server.service"



...