GPU as GRES (Generic Resource)
Refer to the Slurm Generic Resource (GRES) Scheduling documentation.
Accounting and Limits
Refer to the Slurm Accounting and Limits documentation.
Core as consumable resource:
# uncomment SelectType=select/cons_res in slurm.conf
sed -i 's/#SelectType=select\/cons_res/SelectType=select\/cons_res/g' /etc/slurm/slurm.conf
# insert SelectTypeParameters=CR_Core on the line after it
sed -i '/SelectType=select\/cons_res/a SelectTypeParameters=CR_Core' /etc/slurm/slurm.conf
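After the two sed commands, slurm.conf should contain the lines below; the restart step is a sketch assuming systemd-managed Slurm daemons:
# expected result in /etc/slurm/slurm.conf
SelectType=select/cons_res
SelectTypeParameters=CR_Core
# apply the change
systemctl restart slurmctld   # on the controller
systemctl restart slurmd      # on each compute node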
Manually resume a State=DOWN node of a cluster
scontrol update NodeName=node10 State=RESUME
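To verify the node is back in service (both are standard Slurm commands):
# confirm the node state after resuming
scontrol show node node10 | grep State
# sinfo -R lists nodes that are down/drained, with the reason
sinfo -R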
Refer to How to “undrain” slurm nodes in drain state
NFS failed to restart due to the mount error below
### Error:
May 14 00:03:30 rbx06 systemd[1]: dev-disk-by\x2duuid-62ccfba0\x2d6394\x2d42c0\x2dbd38\x2d3da2ea4893b6.device: Job dev-disk-by\x2duuid-62ccfba0\x2d6394\x2d42c0\x2dbd38\x2d3da2ea4893b6.device/start timed out.
May 14 00:03:30 rbx06 systemd[1]: Timed out waiting for device /dev/disk/by-uuid/62ccfba0-6394-42c0-bd38-3da2ea4893b6.
May 14 00:03:30 rbx06 systemd[1]: Dependency failed for /dev/disk/by-uuid/62ccfba0-6394-42c0-bd38-3da2ea4893b6.
### or like this:
Oct 31 20:28:20 thutmose kernel: EXT4-fs (sdc1): mounting ext3 file system using the ext4 subsystem
Oct 31 20:28:20 thutmose kernel: EXT4-fs (sdc1): warning: maximal mount count reached, running e2fsck is recommended
Oct 31 20:28:20 thutmose kernel: EXT4-fs (sdc1): mounted filesystem with ordered data mode. Opts: (null)
Oct 31 20:28:20 thutmose sudo[18994]: pam_unix(sudo:session): session closed for user root
Oct 31 20:28:20 thutmose systemd[1]: mnt-attorney.mount: Unit is bound to inactive unit dev-disk-by\x2dlabel-attorney.device. Stopping, too.
Oct 31 20:28:20 thutmose systemd[1]: Unmounting /mnt/attorney...
Oct 31 20:28:21 thutmose systemd[1]: Unmounted /mnt/attorney.
Solution:
# reload systemd unit files to clear the stale device/mount units
systemctl daemon-reload
## re-export the shares and restart rpcbind and nfs
exportfs -a # /etc/exports was updated
systemctl restart rpcbind nfs
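To confirm the shares are being served again (standard nfs-utils commands):
# list the exports now visible from the server itself
showmount -e localhost
exportfs -v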
mkdir disk-checklog-20201216
cd disk-checklog-20201216
# get SMART info for device IDs 30-32 behind the MegaRAID controller on /dev/sdb
smartctl -s on -a -d megaraid,30 /dev/sdb > sdb_DID30.smartctl.txt
smartctl -s on -a -d megaraid,31 /dev/sdb > sdb_DID31.smartctl.txt
smartctl -s on -a -d megaraid,32 /dev/sdb > sdb_DID32.smartctl.txt
# compare two drives against each other, then against the previous check
diff sdb_DID3{0,1}*
diff sdb_DID30.smartctl.txt ../disk-checklog/sdb_DID30.smartctl
# scan the reports for errors
grep -i error sdb_DID3*
grep 'No Errors' sdb_DID3*
# inspect each report in detail
less sdb_DID30.smartctl.txt
# show all hard drives info
/opt/MegaRAID/storcli/storcli64 /c0 show > c0_show.txt
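For a quick pass/fail summary without the full report, smartctl's -H option prints only the health assessment:
# quick overall health check for one device behind the controller
smartctl -H -d megaraid,30 /dev/sdb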
# LED mark-up (blink the locate LED of) a HDD, example:
# /c0 i.e. controller 0, e27 = enclosure ID 27, s0 = slot 0
/opt/MegaRAID/storcli/storcli64 /c0/e27/s0 start locate
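Once the drive has been identified, the same path turns the LED off again:
# stop blinking the locate LED
/opt/MegaRAID/storcli/storcli64 /c0/e27/s0 stop locate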
# create raid1 using /dev/sda1 and /dev/sdb1
mdadm --create /dev/md0 --level=mirror --raid-devices=2 /dev/sd[ab]1
# List raid1 info
mdadm --detail /dev/md0
# check raid config in /dev/sd[ab]
mdadm -E /dev/sd[ab]
# Then install CentOS on /dev/md0
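Typical follow-up steps, as a sketch (mdadm.conf lives at /etc/mdadm.conf on CentOS; Debian-family systems use /etc/mdadm/mdadm.conf):
# watch the initial mirror sync progress
cat /proc/mdstat
# record the array so it is assembled automatically at boot
mdadm --detail --scan >> /etc/mdadm.conf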
Refer to:
- Setting up RAID 1 (Mirroring) using ‘Two Disks’ in Linux – Part 3
# enable ipv4 forwarding
echo 1 > /proc/sys/net/ipv4/ip_forward
# CentOS 6: masquerade outbound traffic (XXXX = the outbound interface)
iptables -t nat -A POSTROUTING -o XXXX -j MASQUERADE
# CentOS/RHEL 7
$ firewall-cmd --add-masquerade --permanent
$ firewall-cmd --reload
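To keep forwarding enabled across reboots (a sketch using sysctl; a drop-in file under /etc/sysctl.d/ works the same way on CentOS/RHEL 7):
# persist the setting across reboots
echo 'net.ipv4.ip_forward = 1' >> /etc/sysctl.conf
sysctl -p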
Install tools for IPMI
yum -y install OpenIPMI OpenIPMI-devel ipmitool freeipmi
[root@sr2500 ~]# ipmitool lan set 1 ipsrc static
[root@sr2500 ~]# ipmitool lan set 1 ipaddr 192.168.1.211
Setting LAN IP Address to 192.168.1.211
[root@sr2500 ~]# ipmitool lan set 1 netmask 255.255.255.0
Setting LAN Subnet Mask to 255.255.255.0
[root@sr2500 ~]# ipmitool lan set 1 defgw ipaddr 192.168.1.254
Setting LAN Default Gateway IP to 192.168.1.254
# check the configuration
ipmitool lan print 1
# print sensor data
ipmitool sensor list
Check server status via IPMI
## List CPU status
$ ipmitool -I lanplus -H bmc-ip -U user -P pass sdr | grep -i CPU | grep -i Status
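The same lanplus options work for remote power control (bmc-ip, user, and pass are placeholders as above):
# query and cycle chassis power remotely
$ ipmitool -I lanplus -H bmc-ip -U user -P pass chassis power status
$ ipmitool -I lanplus -H bmc-ip -U user -P pass chassis power cycle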
Create RAID10:
# HELP: sudo storcli /cx add vd type=[RAID0(r0)|RAID1(r1)|...] drives=[EnclosureID:SlotID|:SlotID-SlotID|:SlotID,SlotID]
storcli64 /c0 add vd r10 drives=8:0,1,2,3 pdperarray=2
# find which virtual drive is the RAID10, which is v0 here
storcli64 /c0 show
storcli64 /c0/v0 show
storcli64 /c0/v0 show all | less
# initialize RAID10
storcli64 /c0/v0 start init
storcli64 /c0/v0 show init
storcli64 /c0/vall show all | less
Create RAID 6:
storcli64 /c0 add vd type=raid6 drives=8:4-15
# find the virtual drive id of the RAID 6 just created
storcli64 /c0/vall show
storcli64 /c0/v2 show
storcli64 /c0/v2 show all | more
# initialize RAID6
storcli64 /c0/v2 start init
storcli64 /c0/v2 show init
storcli64 /c0/vall show
Examples of creating RAID 10, RAID 50, and RAID 60 with StorCLI:
# RAID 10
C:\>storcli64 /C0 add vd type=raid10 drives=83:5,6,7,8 pdperarray=2
# RAID 50
C:\>storcli64 /C0 add vd type=raid50 drives=83:5,6,7,8,9,10 pdperarray=3
# RAID 60
C:\>storcli64 /C0 add vd type=raid60 drives=83:5,6,7,8,9,10,11,12 pdperarray=4
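After creating a virtual drive either way, initialization progress of all virtual drives can be checked in one command (same show init subcommand as used above):
# check initialization progress across all virtual drives
storcli64 /c0/vall show init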