1. Set up failover. ------------------- On the primary [root@d_server pbspro]# cat /etc/pbs.conf PBS_SERVER=d_server PBS_START_SERVER=1 PBS_START_SCHED=1 PBS_START_COMM=1 PBS_START_MOM=1 PBS_EXEC=/opt/pbs PBS_HOME=/home/prakv/pbspro/shared_home PBS_CORE_LIMIT=unlimited PBS_SCP=/bin/scp PBS_PRIMARY=d_server PBS_SECONDARY=d_server2 [root@d_server pbspro]# On the secondary [root@d_server2 pbspro]# cat /etc/pbs.conf PBS_SERVER=d_server PBS_START_SERVER=1 PBS_START_SCHED=0 PBS_START_COMM=0 PBS_START_MOM=0 PBS_EXEC=/opt/pbs PBS_HOME=/home/prakv/pbspro/shared_home PBS_CORE_LIMIT=unlimited PBS_SCP=/bin/scp PBS_PRIMARY=d_server PBS_SECONDARY=d_server2 [root@d_server2 pbspro]# 2. Check that the scheduler is not running on the secondary host. ----------------------------------------------------------------- [root@d_server2 pbspro]# !ps ps -elf | grep pbs 5 S root 13915 1 0 80 0 - 29287 ep_pol 10:34 ? 00:00:00 /opt/pbs/sbin/pbs_server.bin 0 S root 13919 22 0 80 0 - 3118 pipe_w 12:43 ? 00:00:00 grep --color=auto pbs [root@d_server2 pbspro]# 3. Kill server process on primary --------------------------------- [root@d_server pbspro]# ps -elf | grep pbs 5 Z root 25286 1 0 80 0 - 0 exit 10:31 ? 00:00:00 [pbs_sched] 5 S root 26857 1 0 80 0 - 92468 hrtime 12:19 ? 00:00:00 /opt/pbs/sbin/pbs_comm 5 S root 26887 1 0 80 0 - 47002 ep_pol 12:19 ? 00:00:00 /opt/pbs/sbin/pbs_mom 5 S root 27183 1 0 80 0 - 18078 hrtime 12:20 ? 00:00:00 /opt/pbs/sbin/pbs_ds_monitor monitor 0 S postgres 27229 1 0 80 0 - 33680 poll_s 12:20 ? 00:00:00 /usr/bin/postgres -D /home/prakv/pbspro/shared_home/datastore -p 15007 5 t root 27342 1 0 80 0 - 37013 ptrace 12:20 ? 00:00:00 /opt/pbs/sbin/pbs_sched 1 S postgres 28109 27229 0 80 0 - 34440 sk_wai 12:23 ? 00:00:00 postgres: postgres pbs_datastore 172.17.0.2(33260) idle 5 S root 28110 1 0 80 0 - 57573 ep_pol 12:23 ? 00:00:00 /opt/pbs/sbin/pbs_server.bin 0 S root 28136 22 0 80 0 - 3118 pipe_w 12:42 ? 
00:00:00 grep --color=auto pbs [root@d_server pbspro]# kill -9 28110 4. Verify that the server process is not running on the primary --------------------------------------------------------------- [root@d_server pbspro]# !ps ps -elf | grep pbs 5 Z root 25286 1 0 80 0 - 0 exit 10:31 ? 00:00:00 [pbs_sched] 5 S root 26857 1 0 80 0 - 92468 hrtime 12:19 ? 00:00:00 /opt/pbs/sbin/pbs_comm 5 S root 26887 1 0 80 0 - 47002 ep_pol 12:19 ? 00:00:00 /opt/pbs/sbin/pbs_mom 5 S root 27183 1 0 80 0 - 18078 hrtime 12:20 ? 00:00:00 /opt/pbs/sbin/pbs_ds_monitor monitor 0 S postgres 27229 1 0 80 0 - 33680 poll_s 12:20 ? 00:00:00 /usr/bin/postgres -D /home/prakv/pbspro/shared_home/datastore -p 15007 5 t root 27342 1 0 80 0 - 37013 ptrace 12:20 ? 00:00:00 /opt/pbs/sbin/pbs_sched 0 S root 28140 22 0 80 0 - 3118 pipe_w 12:43 ? 00:00:00 grep --color=auto pbs 5. Wait for the secondary to take over and confirm. --------------------------------------------------- [root@d_server2 pbspro]# qstat -Bf Server: d_server server_state = Active server_host = d_server2 scheduling = True total_jobs = 0 state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 Begun :0 acl_roots = root default_queue = workq log_events = 511 mail_from = adm query_other_jobs = True resources_default.ncpus = 1 default_chunk.ncpus = 1 scheduler_iteration = 600 FLicenses = 20000000 resv_enable = True node_fail_requeue = 310 max_array_size = 10000 pbs_license_min = 0 pbs_license_max = 2147483647 pbs_license_linger_time = 31536000 license_count = Avail_Global:10000000 Avail_Local:10000000 Used:0 High_Use: 0 pbs_version = 18.1.0 eligible_time_enable = False max_concurrent_provision = 5 power_provisioning = False [root@d_server2 pbspro]# 6. Verify that the scheduler process is spawned on the secondary host. ---------------------------------------------------------------------- [root@d_server2 pbspro]# !ps ps -elf | grep pbs 5 S root 13915 1 0 80 0 - 57572 ep_pol 10:34 ? 
00:00:00 /opt/pbs/sbin/pbs_server.bin 5 S root 14212 1 0 80 0 - 18077 hrtime 12:44 ? 00:00:00 /opt/pbs/sbin/pbs_ds_monitor monitor 0 S postgres 14246 1 0 80 0 - 33679 poll_s 12:44 ? 00:00:00 /usr/bin/postgres -D /home/prakv/pbspro/shared_home/datastore -p 15007 1 S postgres 14263 14246 0 80 0 - 34424 sk_wai 12:44 ? 00:00:00 postgres: postgres pbs_datastore 172.17.0.3(59922) idle 1 S root 14269 1 0 80 0 - 36987 poll_s 12:44 ? 00:00:00 /opt/pbs/sbin/pbs_sched 0 R root 14272 22 0 80 0 - 2227 - 12:44 ? 00:00:00 grep --color=auto pbs [root@d_server2 pbspro]# 7. Submit a job. ---------------- [root@d_server2 pbspro]# qsub -- /bin/sleep 1 1.d_server 8. Verify that the job ran. --------------------------- [root@d_server2 pbspro]# qstat -s d_server: Req'd Req'd Elap Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time --------------- -------- -------- ---------- ------ --- --- ------ ----- - ----- 1.d_server root workq STDIN 28273 1 1 -- -- E 00:00 Job run at Thu May 03 at 12:44 on (d_server:ncpus=1) [root@d_server2 pbspro]# 9. Check server logs to verify that the job was run on request from scheduler on secondary host. 
------------------------------------------------------------------------------------------------ 05/03/2018 12:44:50;0100;Server@d_server2;Job;1.d_server;enqueuing into workq, state 1 hop 1 05/03/2018 12:44:50;0008;Server@d_server2;Job;1.d_server;Job Queued at request of root@d_server2, owner = root@d_server2, job name = STDIN, queue = workq 05/03/2018 12:44:50;0040;Server@d_server2;Svr;d_server;Scheduler sent command 1 05/03/2018 12:44:50;0040;Server@d_server2;Svr;d_server;Scheduler sent command 0 05/03/2018 12:44:50;0100;Server@d_server2;Req;;Type 21 request received from Scheduler@d_server2, sock=18 05/03/2018 12:44:50;0100;Server@d_server2;Req;;Type 81 request received from Scheduler@d_server2, sock=18 05/03/2018 12:44:50;0100;Server@d_server2;Req;;Type 71 request received from Scheduler@d_server2, sock=18 05/03/2018 12:44:50;0100;Server@d_server2;Req;;Type 58 request received from Scheduler@d_server2, sock=18 05/03/2018 12:44:50;0100;Server@d_server2;Req;;Type 20 request received from Scheduler@d_server2, sock=18 05/03/2018 12:44:50;0100;Server@d_server2;Req;;Type 51 request received from Scheduler@d_server2, sock=18 05/03/2018 12:44:50;0100;Server@d_server2;Req;;Type 23 request received from Scheduler@d_server2, sock=18 05/03/2018 12:44:50;0008;Server@d_server2;Job;1.d_server;Job Run at request of Scheduler@d_server2 on exec_vnode (d_server:ncpus=1) 05/03/2018 12:44:51;0080;Server@d_server2;Job;1.d_server;Obit received momhop:1 serverhop:1 state:4 substate:42 10. Start the server on primary host. ------------------------------------- [root@d_server pbspro]# /etc/init.d/pbs start Starting PBS PBS comm already running. PBS mom already running. PBS scheduler already running. 
Notifying Secondary Server that we are taking over Have taken control from Secondary Server Connecting to PBS dataservice.....connected to PBS dataservice@d_server Licenses valid for 10000000 Floating hosts PBS server [root@d_server pbspro]# date Thu May 3 12:46:03 UTC 2018 [root@d_server pbspro]# 11. Verify that the primary has taken over. ------------------------------------------- [root@d_server /]# qstat -Bf Server: d_server server_state = Active server_host = d_server scheduling = True total_jobs = 0 state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 Begun :0 acl_roots = root default_queue = workq log_events = 511 mail_from = adm query_other_jobs = True resources_default.ncpus = 1 default_chunk.ncpus = 1 resources_assigned.ncpus = 0 resources_assigned.nodect = 0 scheduler_iteration = 600 FLicenses = 20000000 resv_enable = True node_fail_requeue = 310 max_array_size = 10000 pbs_license_min = 0 pbs_license_max = 2147483647 pbs_license_linger_time = 31536000 license_count = Avail_Global:10000000 Avail_Local:10000000 Used:0 High_Use: 0 pbs_version = 18.1.0 eligible_time_enable = False max_concurrent_provision = 5 power_provisioning = False [root@d_server /]# 12. Check scheduler logs to verify that the scheduler reloaded the configuration. --------------------------------------------------------------------------------- 05/03/2018 12:45:48;0040;pbs_sched;Sched;reconfigure;Scheduler is reconfiguring 05/03/2018 12:45:48;0040;pbs_sched;Fil;sched_config;Obsolete config name sort_queues 05/03/2018 12:45:48;0004;pbs_sched;Fil;holidays;The holiday file is out of date; please update it. 13. Confirm that the newly spawned scheduler process on secondary host was made to quit. ---------------------------------------------------------------------------------------- [root@d_server2 pbspro]# !ps ps -elf | grep pbs 5 S root 14634 1 0 80 0 - 29287 ep_pol 12:45 ? 00:00:00 /opt/pbs/sbin/pbs_server.bin 0 S root 14636 22 0 80 0 - 3118 pipe_w 13:06 ? 
00:00:00 grep --color=auto pbs [root@d_server2 pbspro]#