A Ceph OSD will go into the down state when its journal disk fails.
You can verify the cluster health status by using the ceph -s command.
#ceph -s
cluster:
id: 44a284ba-f302-476b-a969-aea2d70397d1
health: HEALTH_WARN
noscrub,nodeep-scrub flag(s) set
2 nearfull osd(s)
258889/9073545 objects misplaced (2.853%)
Degraded data redundancy: 35059/9073545 objects degraded (0.386%), 571 pgs unclean, 93 pgs degraded, 70 pgs undersized
services:
mon: 4 daemons, quorum ceph4,ceph1,ceph2,ceph3
mgr: ceph4(active), standbys: ceph3, ceph2, ceph1
osd: 71 osds: 55 up, 55 in; 554 remapped pgs
flags nearfull,noscrub,nodeep-scrub
data:
pools: 4 pools, 5696 pgs
objects: 2953k objects, 11994 GB
usage: 36589 GB used, 24858 GB / 61447 GB avail
pgs: 0.018% pgs not active
35059/9073545 objects degraded (0.386%)
258889/9073545 objects misplaced (2.853%)
5125 active+clean
475 active+remapped+backfill_wait
62 active+undersized+degraded+remapped+backfill_wait
15 active+recovery_wait+degraded
8 active+undersized+degraded+remapped+backfilling
5 active+recovery_wait+degraded+remapped
3 active+degraded+remapped+backfill_wait
2 active+remapped+backfilling
1 peering
io:
client: 213 kB/s rd, 175 MB/s wr, 35 op/s rd, 928 op/s wr
recovery: 206 MB/s, 53 objects/s
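If you need a per-item breakdown of these warnings (which OSDs are nearfull, which PGs are degraded, and so on), ceph health detail prints a more verbose report; the exact output depends on your cluster.
ceph health detail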
The Ceph cluster is in HEALTH_WARN state because multiple journal disks have failed.
You can use the ceph osd tree command to check the status of each OSD.
[root@ceph1 ceph]# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 77.46774 root default
-3 19.63971 host ceph1
0 hdd 1.09109 osd.0 down 0 1.00000
4 hdd 1.09109 osd.4 down 0 1.00000
7 hdd 1.09109 osd.7 down 0 1.00000
9 hdd 1.09109 osd.9 down 0 1.00000
20 hdd 1.09109 osd.20 down 0 1.00000
22 hdd 1.09109 osd.22 down 0 1.00000
30 hdd 1.09109 osd.30 down 1.00000 1.00000
32 hdd 1.09109 osd.32 down 0 1.00000
40 hdd 1.09109 osd.40 down 0 1.00000
41 hdd 1.09109 osd.41 down 0 1.00000
44 hdd 1.09109 osd.44 up 1.00000 1.00000
45 hdd 1.09109 osd.45 up 1.00000 1.00000
47 hdd 1.09109 osd.47 up 1.00000 1.00000
52 hdd 1.09109 osd.52 down 1.00000 1.00000
53 hdd 1.09109 osd.53 up 1.00000 1.00000
54 hdd 1.09109 osd.54 up 1.00000 1.00000
55 hdd 1.09109 osd.55 up 1.00000 1.00000
59 hdd 1.09109 osd.59 down 0 1.00000
-7 18.54861 host ceph2
1 hdd 1.09109 osd.1 up 1.00000 1.00000
3 hdd 1.09109 osd.3 up 1.00000 1.00000
6 hdd 1.09109 osd.6 up 1.00000 1.00000
10 hdd 1.09109 osd.10 up 1.00000 1.00000
14 hdd 1.09109 osd.14 up 1.00000 1.00000
15 hdd 1.09109 osd.15 up 1.00000 1.00000
17 hdd 1.09109 osd.17 down 0 1.00000
19 hdd 1.09109 osd.19 down 0 1.00000
24 hdd 1.09109 osd.24 down 0 1.00000
26 hdd 1.09109 osd.26 up 1.00000 1.00000
27 hdd 1.09109 osd.27 up 1.00000 1.00000
38 hdd 1.09109 osd.38 up 1.00000 1.00000
48 hdd 1.09109 osd.48 up 1.00000 1.00000
49 hdd 1.09109 osd.49 up 1.00000 1.00000
50 hdd 1.09109 osd.50 up 1.00000 1.00000
56 hdd 1.09109 osd.56 up 1.00000 1.00000
58 hdd 1.09109 osd.58 up 1.00000 1.00000
-5 19.63971 host ceph3
2 hdd 1.09109 osd.2 down 0 1.00000
5 hdd 1.09109 osd.5 down 0 1.00000
8 hdd 1.09109 osd.8 down 0 1.00000
11 hdd 1.09109 osd.11 up 1.00000 1.00000
13 hdd 1.09109 osd.13 up 1.00000 1.00000
16 hdd 1.09109 osd.16 up 1.00000 1.00000
18 hdd 1.09109 osd.18 up 1.00000 1.00000
21 hdd 1.09109 osd.21 up 1.00000 1.00000
28 hdd 1.09109 osd.28 up 1.00000 1.00000
29 hdd 1.09109 osd.29 down 0 1.00000
31 hdd 1.09109 osd.31 down 0 1.00000
34 hdd 1.09109 osd.34 up 1.00000 1.00000
35 hdd 1.09109 osd.35 down 1.00000 1.00000
36 hdd 1.09109 osd.36 up 1.00000 1.00000
37 hdd 1.09109 osd.37 up 1.00000 1.00000
39 hdd 1.09109 osd.39 up 1.00000 1.00000
43 hdd 1.09109 osd.43 up 1.00000 1.00000
46 hdd 1.09109 osd.46 up 1.00000 1.00000
-9 19.63971 host ceph4
12 hdd 1.09109 osd.12 up 1.00000 1.00000
23 hdd 1.09109 osd.23 up 1.00000 1.00000
33 hdd 1.09109 osd.33 up 1.00000 1.00000
42 hdd 1.09109 osd.42 up 1.00000 1.00000
51 hdd 1.09109 osd.51 up 1.00000 1.00000
57 hdd 1.09109 osd.57 up 1.00000 1.00000
60 hdd 1.09109 osd.60 up 1.00000 1.00000
61 hdd 1.09109 osd.61 up 1.00000 1.00000
62 hdd 1.09109 osd.62 up 1.00000 1.00000
63 hdd 1.09109 osd.63 up 1.00000 1.00000
64 hdd 1.09109 osd.64 up 1.00000 1.00000
65 hdd 1.09109 osd.65 up 1.00000 1.00000
66 hdd 1.09109 osd.66 up 1.00000 1.00000
67 hdd 1.09109 osd.67 up 1.00000 1.00000
68 hdd 1.09109 osd.68 up 1.00000 1.00000
69 hdd 1.09109 osd.69 up 1.00000 1.00000
70 hdd 1.09109 osd.70 up 1.00000 1.00000
71 hdd 1.09109 osd.71 up 1.00000 1.00000
You can see that several OSDs are down on the ceph1, ceph2, and ceph3 nodes.
Use the ceph-disk list command to list the disk partitions and their associated Ceph OSDs.
[root@ceph2 ~]# ceph-disk list
/dev/dm-0 other, xfs, mounted on /
/dev/dm-1 swap, swap
/dev/sda :
/dev/sda1 ceph data, active, cluster ceph, osd.1, journal /dev/sdt1
/dev/sdb :
/dev/sdb1 ceph data, active, cluster ceph, osd.3, journal /dev/sdt2
/dev/sdc :
/dev/sdc1 ceph data, active, cluster ceph, osd.6, journal /dev/sdt3
/dev/sdd :
/dev/sdd1 ceph data, active, cluster ceph, osd.10, journal /dev/sdu1
/dev/sde :
/dev/sde1 ceph data, active, cluster ceph, osd.14, journal /dev/sdu2
/dev/sdf :
/dev/sdf1 ceph data, active, cluster ceph, osd.15, journal /dev/sdu3
/dev/sdg :
/dev/sdg1 ceph data, active, cluster ceph, osd.17
/dev/sdh :
/dev/sdh1 ceph data, active, cluster ceph, osd.19
/dev/sdi :
/dev/sdi1 ceph data, active, cluster ceph, osd.24
/dev/sdj :
/dev/sdj1 ceph data, active, cluster ceph, osd.25, journal /dev/sdw1
/dev/sdk :
/dev/sdk1 ceph data, active, cluster ceph, osd.26, journal /dev/sdw2
/dev/sdl :
/dev/sdl1 ceph data, active, cluster ceph, osd.27, journal /dev/sdw3
/dev/sdm :
/dev/sdm1 ceph data, active, cluster ceph, osd.58, journal /dev/sdx1
/dev/sdn :
/dev/sdn1 ceph data, active, cluster ceph, osd.48, journal /dev/sdx2
/dev/sdo :
/dev/sdo1 ceph data, active, cluster ceph, osd.38, journal /dev/sdx3
/dev/sdp :
/dev/sdp1 ceph data, active, cluster ceph, osd.49, journal /dev/sdy1
/dev/sdq :
/dev/sdq1 other, vfat, mounted on /boot/efi
/dev/sdq2 other, ext4, mounted on /boot
/dev/sdq3 other, LVM2_member
/dev/sdr :
/dev/sdr1 ceph data, active, cluster ceph, osd.56, journal /dev/sdy2
/dev/sds :
/dev/sds1 ceph data, active, cluster ceph, osd.50, journal /dev/sdy3
/dev/sdt :
/dev/sdt1 ceph journal, for /dev/sda1
/dev/sdt2 ceph journal, for /dev/sdb1
/dev/sdt3 ceph journal, for /dev/sdc1
/dev/sdu :
/dev/sdu1 ceph journal, for /dev/sdd1
/dev/sdu2 ceph journal, for /dev/sde1
/dev/sdu3 ceph journal, for /dev/sdf1
/dev/sdv :
/dev/sdv1 other, ebd0a0a2-b9e5-4433-87c0-68b6b72699c7
/dev/sdv2 other, ebd0a0a2-b9e5-4433-87c0-68b6b72699c7
/dev/sdv3 other, ebd0a0a2-b9e5-4433-87c0-68b6b72699c7
/dev/sdw :
/dev/sdw1 ceph journal, for /dev/sdj1
/dev/sdw2 ceph journal, for /dev/sdk1
/dev/sdw3 ceph journal, for /dev/sdl1
/dev/sdx :
/dev/sdx1 ceph journal, for /dev/sdm1
/dev/sdx2 ceph journal, for /dev/sdn1
/dev/sdx3 ceph journal, for /dev/sdo1
/dev/sdy :
/dev/sdy1 ceph journal, for /dev/sdp1
/dev/sdy2 ceph journal, for /dev/sdr1
/dev/sdy3 ceph journal, for /dev/sds1
[root@ceph2 ~]#
The ceph2 node is an HPE DL380 Gen9 server, and I have replaced its faulty SSD journal disk /dev/sdv. In the command output above, you can see that /dev/sdv is no longer mapped to any OSDs.
/dev/sdv1 other, ebd0a0a2-b9e5-4433-87c0-68b6b72699c7
/dev/sdv2 other, ebd0a0a2-b9e5-4433-87c0-68b6b72699c7
/dev/sdv3 other, ebd0a0a2-b9e5-4433-87c0-68b6b72699c7
In my setup, each SSD is used as a journal disk for three OSDs. On the ceph2 node, one SSD journal disk failed, which affected three OSDs (17, 19, and 24).
/dev/sdg :
/dev/sdg1 ceph data, active, cluster ceph, osd.17
/dev/sdh :
/dev/sdh1 ceph data, active, cluster ceph, osd.19
/dev/sdi :
/dev/sdi1 ceph data, active, cluster ceph, osd.24
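Before touching the new disk, you can confirm that these three OSDs lost their journal by looking at the journal symlinks in their data directories; since the journal partitions on the failed SSD are gone, the by-partuuid targets will typically be dangling.
ls -l /var/lib/ceph/osd/ceph-17/journal /var/lib/ceph/osd/ceph-19/journal /var/lib/ceph/osd/ceph-24/journal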
Now I am going to show how to re-create the journal on /dev/sdv. Follow the same steps for every faulty journal disk.
1) Before getting started, set the 'noout' flag so that Ceph does not mark OSDs out and start rebalancing while you work.
ceph osd set noout
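You can confirm the flag is set by checking the cluster flags, for example:
ceph osd dump | grep flags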
2) Stop the OSD where the journal will be changed.
systemctl stop ceph-osd@${osdid}
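On ceph2, the three OSDs that lost their journal are 17, 19, and 24, so the stop command on that node looks like this:
systemctl stop ceph-osd@17 ceph-osd@19 ceph-osd@24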
3) Check whether any partitions exist on /dev/sdv (new or replacement disk).
sgdisk -p /dev/sdv
4) Delete any existing partitions.
sgdisk --delete=1 -- /dev/sdv
sgdisk --delete=2 -- /dev/sdv
sgdisk --delete=3 -- /dev/sdv
5) Normally you would flush the journal first, but in my case the SSD journal disk has failed, so the command below will not work.
ceph-osd -i ${osdid} --flush-journal
Instead, manually delete the journal symbolic link from each failed OSD.
rm /var/lib/ceph/osd/ceph-17/journal
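Since all three OSDs on this node shared the failed SSD, the same symlink has to be removed for osd.19 and osd.24 as well; a short loop keeps it tidy:
for id in 17 19 24; do rm /var/lib/ceph/osd/ceph-$id/journal; done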
6) Create the new journal partitions on the new disk.
Execute the command below three times to create three 10 GB partitions.
sgdisk --new=0:0:+10240M -- /dev/sdv
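If you prefer, the three runs can be wrapped in a loop; sgdisk assigns the next free partition number each time, so this creates /dev/sdv1, /dev/sdv2, and /dev/sdv3:
for i in 1 2 3; do sgdisk --new=0:0:+10240M -- /dev/sdv; done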
7) Verify the partitions by using the sgdisk command.
sgdisk -p /dev/sdv
8) Note down the old journal partUUID of the OSD.
cat /var/lib/ceph/osd/ceph-17/journal_uuid
667f4a63-8b34-459b-b578-4dd12ad7d228
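Collect the old partUUID for each affected OSD on the node, since you will need them in the next step (each OSD has its own value):
for id in 17 19 24; do echo -n "osd.$id journal_uuid: "; cat /var/lib/ceph/osd/ceph-$id/journal_uuid; done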
9) Set the proper parameters on the new partition.
sgdisk --change-name=1:"ceph journal" --partition-guid=1:667f4a63-8b34-459b-b578-4dd12ad7d228 --typecode=1:45b0969e-9b03-4f30-b4c6-b4b80ceff106 --mbrtogpt -- /dev/sdv
1 --> Partition number. I created three partitions and am using the first one for osd.17.
667f4a63-8b34-459b-b578-4dd12ad7d228 --> The old partUUID of osd.17.
45b0969e-9b03-4f30-b4c6-b4b80ceff106 --> The Ceph journal partition type code, common to all OSD journals.
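For osd.19 and osd.24, run the same command against partitions 2 and 3, substituting the partUUIDs you noted in step 8; the placeholders below are not real values:
sgdisk --change-name=2:"ceph journal" --partition-guid=2:<osd.19-partuuid> --typecode=2:45b0969e-9b03-4f30-b4c6-b4b80ceff106 --mbrtogpt -- /dev/sdv
sgdisk --change-name=3:"ceph journal" --partition-guid=3:<osd.24-partuuid> --typecode=3:45b0969e-9b03-4f30-b4c6-b4b80ceff106 --mbrtogpt -- /dev/sdv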
10) Create a symbolic link.
ln -s /dev/disk/by-partuuid/667f4a63-8b34-459b-b578-4dd12ad7d228 /var/lib/ceph/osd/ceph-17/journal
11) Check that /dev/disk/by-partuuid/667f4a63-8b34-459b-b578-4dd12ad7d228 is pointing to /dev/sdv1 by using the ls -l command.
ls -l /dev/disk/by-partuuid/667f4a63-8b34-459b-b578-4dd12ad7d228
12) Change the ownership to the ceph user.
chown ceph:ceph /dev/sdv1
chown -R ceph:ceph /var/lib/ceph/osd/ceph-17/
13) Create the new journal on the new disk partition /dev/sdv1.
ceph-osd -i 17 --mkjournal
The above command initializes a new journal for osd.17 on the new partition /dev/sdv1.
14) Start the OSD service for osd.17.
systemctl start ceph-osd@17
15) Now check the OSD up count by using ceph -s.
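Once all OSDs are back up and recovery has completed, remember to clear the flag that was set in step 1:
ceph osd unset noout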
I repeated the same steps for all the faulty journal disks and re-created their journals. Now my cluster health status is HEALTH_OK.
[root@ceph1 ~]# ceph -s
cluster:
id: 44a284ba-f302-476b-a969-aea2d70397d1
health: HEALTH_OK
services:
mon: 4 daemons, quorum ceph4,ceph1,ceph2,ceph3
mgr: ceph4(active), standbys: ceph3, ceph2, ceph1
osd: 71 osds: 71 up, 71 in
data:
pools: 4 pools, 5696 pgs
objects: 3096k objects, 12552 GB
usage: 37410 GB used, 40795 GB / 78206 GB avail
pgs: 5696 active+clean
io:
client: 223 kB/s rd, 113 MB/s wr, 12 op/s rd, 2196 op/s wr