[Beowulf] torque issues...

Fri Aug 26 14:47:41 PDT 2005

Good day all,

I'm having an issue with (I think) pbs_sched.

I submit a very simple job (script at the end of this mail); all it
does is "echo test>/home/trent/trenttest".  I submit it with "qsub
pbs_test.sh".  When I watch the queue with qstat -an, I see the job
enter the status Q, then a second later it starts to run (R) and
gets two nodes (like I asked for), then about 2 minutes later it
errors (E).

In my script I've set -o and -e but I never get any output (not my
biggest concern, but I'm not getting any feedback).  If I look at the
server_logs, I can see the job being submitted:

08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
Queued at request of trent at r111.testcluster.com, owner =
trent at r111.testcluster.com, job name = PBS_MPI_Test, queue = default
08/26/2005 17:30:21;0040;PBS_Server;Svr;r111.testcluster.com;Scheduler
sent command new
08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
Modified at request of Scheduler at r111.testcluster.com
08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
Run at request of Scheduler at r111.testcluster.com

then, it looks like the job has run (I know it has because my file
that I echo >'d to has the info):

08/26/2005 17:32:18;0010;PBS_Server;Job;16.r111.testcluster.com;Exit_status=0
resources_used.cput=00:00:00 resources_used.mem=524kb
resources_used.vmem=7360kb resources_used.walltime=00:01:56

But then I get this:

08/26/2005 17:32:40;000d;PBS_Server;Job;16.r111.testcluster.com;Post
job file processing error; job 16.r111.testcluster.com on host
n02/1+n02/0+n01/1+n01/0

I've googled this quite a bit and it seems like this is some kind of
communication error between the nodes and the headnode.  Password-less
ssh and rsh both work to and from all the machines in the cluster, so
I'm not sure what exactly is failing to communicate.

/home is NFS-mounted on the slave nodes, so I don't think it's any
kind of permission error.

If I look at the mom_logs from one of the nodes the job ran on,
everything looks fine:

08/26/2005 14:30:12;0008;   pbs_mom;Job;17.r111.testcluster.com;JOIN
JOB as node 1
08/26/2005 14:30:12;0100;  
pbs_mom;Job;17.r111.testcluster.com;kill_job received
08/26/2005 14:30:12;0001;   pbs_mom;Svr;pbs_mom;im_eof, End of File
from addr 192.168.1.2:1023

This WAS working at one point, then a few people started making
changes and nobody remembers exactly what.  I've rebuilt torque from
source (1.2.0p5) and the exact same problem persists.

I'm pretty stumped right now; any advice would be wonderful!

thanks, deek

here is the script I'm submitting:

#!/bin/sh
#PBS -l nodes=2:ppn=2
#PBS -o /home/trent/output-mytorquejob.log
#PBS -e /home/trent/output-mytorquejob.err
### Queue name
#PBS -q default
### Job name
#PBS -N pbs_test
### Declare job-non-rerunable
#PBS -r n

echo test>/home/trent/trenttest > /home/trent

this is my queue info:

#
# Create queues and set their attributes.
#
#
# Create and define queue high
#
create queue high
set queue high queue_type = Execution
set queue high Priority = 100
set queue high max_running = 50
set queue high max_user_run = 50
#set queue high resources_min.cput = 02:00:00
#set queue high resources_max.cput = 03:00:00
#set queue high resources_default.cput = 02:00:00
set queue high enabled = True
set queue high started = True
#
# Create and define queue default
#
create queue default
set queue default queue_type = Execution
set queue default Priority = 75
set queue default max_running = 50
set queue default max_user_run = 50
#set queue default resources_min.cput = 48:00:00
#set queue default resources_max.cput = 49:00:00
#set queue default resources_default.cput = 48:00:00
set queue default enabled = True
set queue default started = True
#
# Create and define queue medium
#
create queue medium
set queue medium queue_type = Execution
set queue medium Priority = 50
set queue medium max_running = 50
set queue medium max_user_run = 50
#set queue medium resources_min.cput = 168:00:00
#set queue medium resources_max.cput = 169:00:00
#set queue medium resources_default.cput = 168:00:00
set queue medium enabled = True
set queue medium started = True
#
# Create and define queue low
#
create queue low
set queue low queue_type = Execution
set queue low Priority = 25
set queue low max_running = 50
set queue low max_user_run = 50
#set queue low resources_min.cput = 336:00:00
#set queue low resources_max.cput = 337:00:00
#set queue low resources_default.cput = 336:00:00
set queue low enabled = True
set queue low started = True
#
# Set server attributes.
#
set server scheduling = True
set server node_pack = True
set server max_running = 50
set server max_user_run = 50
set server acl_host_enable = True
set server acl_hosts = master.r111.testcluster.com
set server acl_hosts += *.r111.testcluster.com
set server acl_hosts += localhost.localdomain
set server managers = mpiadmin at r111.testcluster.com
set server managers += mpiadmin at r111.testcluster.com
set server managers += root at r111.testcluster.com
set server managers += root at localhost.localdomain
set server default_queue = default
set server log_events = 127
set server mail_from = pbsadmin
set server query_other_jobs = True
set server resources_default.neednodes = 1
set server resources_default.nodect = 1
set server resources_default.nodes = 1
set server scheduler_iteration = 150
set server node_ping_rate = 150
set server node_check_rate = 300
set server comment = 'Torque Server @ r111.testcluster.com'
#
# Define cluster nodes
#
create node r111 np=2,ntype=time-shared,properties="headnode"
create node n01 np=2,ntype=cluster,properties="odd"
create node n02 np=2,ntype=cluster,properties="even"
create node n03 np=2,ntype=cluster,properties="odd"
create node n04 np=2,ntype=cluster,properties="even"
create node n05 np=2,ntype=cluster,properties="odd"
create node n06 np=2,ntype=cluster,properties="even"
create node n07 np=2,ntype=cluster,properties="odd"