A race in suspend/resume at end of job lets the suspend "succeed" but leaves the job in limbo

Description

[root@physics jobs]# qsub -lselect=5:ncpus=4 -- /bin/sleep 100

[root@physics jobs]# qsig -s suspend 48

[root@physics jobs]# qstat -swn

physics:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
------------------------------ --------------- --------------- --------------- -------- ---- ----- ------ ----- - -----
48.physics root workq STDIN 51720 5 20 -- -- S 00:00:10
mom1/0*4+mom2/0*4+mom3/0*4+mom4/0*4+mom5/0*4
Not Running: PBS Error: System error:

[root@physics jobs]# qsig -s resume 48

09/18/2018 18:13:58;0008;pbs_mom;Job;48.physics;Started, pid = 51720
09/18/2018 18:14:08;0004;pbs_mom;Act;get_wm;libmemacct.so.1 not found
09/18/2018 18:14:19;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:14:19;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:14:19;0004;pbs_mom;Job;48.physics;signal job with suspend
09/18/2018 18:14:20;0080;pbs_mom;Job;48.physics;task 00000001 terminated
09/18/2018 18:14:20;0008;pbs_mom;Job;48.physics;Terminated
09/18/2018 18:17:46;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:17:46;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:17:46;0004;pbs_mom;Job;48.physics;signal job with resume
09/18/2018 18:17:46;0080;pbs_mom;Req;req_reject;Reject reply code=15010, aux=0, type=18, from root@172.20.54.181:15001
09/18/2018 18:17:47;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:17:47;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:17:47;0004;pbs_mom;Job;48.physics;signal job with resume
09/18/2018 18:17:47;0080;pbs_mom;Req;req_reject;Reject reply code=15010, aux=0, type=18, from root@172.20.54.181:15001
09/18/2018 18:17:47;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:17:47;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:17:47;0004;pbs_mom;Job;48.physics;signal job with resume
09/18/2018 18:17:47;0080;pbs_mom;Req;req_reject;Reject reply code=15010, aux=0, type=18, from root@172.20.54.181:15001
09/18/2018 18:17:47;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:17:47;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:17:47;0004;pbs_mom;Job;48.physics;signal job with resume
09/18/2018 18:17:47;0080;pbs_mom;Req;req_reject;Reject reply code=15010, aux=0, type=18, from root@172.20.54.181:15001
09/18/2018 18:17:47;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:17:47;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:17:47;0004;pbs_mom;Job;48.physics;signal job with resume
09/18/2018 18:17:47;0080;pbs_mom;Req;req_reject;Reject reply code=15010, aux=0, type=18, from root@172.20.54.181:15001
09/18/2018 18:17:47;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:17:47;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:17:47;0004;pbs_mom;Job;48.physics;signal job with resume
09/18/2018 18:17:47;0080;pbs_mom;Req;req_reject;Reject reply code=15010, aux=0, type=18, from root@172.20.54.181:15001
09/18/2018 18:17:47;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:17:47;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:17:47;0004;pbs_mom;Job;48.physics;signal job with resume
09/18/2018 18:17:47;0080;pbs_mom;Req;req_reject;Reject reply code=15010, aux=0, type=18, from root@172.20.54.181:15001
09/18/2018 18:17:47;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1
09/18/2018 18:17:47;0080;pbs_mom;Job;48.physics;signal job request received
09/18/2018 18:17:47;0004;pbs_mom;Job;48.physics;signal job with resume
09/18/2018 18:17:47;0080;pbs_mom;Req;req_reject;Reject reply code=15010, aux=0, type=18, from root@172.20.54.181:15001
09/18/2018 18:17:47;0100;pbs_mom;Req;;Type 18 request received from root@172.20.54.181:15001, sock=1

Acceptance Criteria

None

Status

Assignee

Brem Anand J K

Reporter

Brem Anand J K

Severity

None

OS

None

Start Date

None

Pull Request URL

None

Story Points

1

Components

Fix versions

Affects versions

Priority

Low
Configure