[Beowulf] torque issues...

Mike Davis jmdavis at mail2.vcu.edu
Fri Aug 26 16:40:51 PDT 2005


Deek,

It may be trying to scp when the /home dir that you are writing to is 
nfs mounted. There should be a section in the manual about this. I'm at 
home and don't have any docs here.

Mike

decon brody wrote:

>Good day all,
>
>I'm having an issue with (I think) pbs_sched.
>
>I submit a very simple job (script at the end of this mail) all it
>does is, "echo test>/home/trent/trenttest."  I submit it with, "qsub
>pbs_test.sh."  When I watch the queue with qstat -an, I see the job
>enter the status Q, then a second later, it starts to run (R) and
>gets two nodes (like I asked for), then about 2 minutes later, it
>errors (E).
>
>In my script I've sent -o and -e but I never get any output (not my
>biggest concern, but I'm not getting any feedback).  If I look at the
>server_logs, I can see the job being submitted:
>
>08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
>Queued at request of trent at r111.testcluster.com, owner =
>trent at r111.testcluster.com, job name = PBS_MPI_Test, queue = default
>08/26/2005 17:30:21;0040;PBS_Server;Svr;r111.testcluster.com;Scheduler
>sent command new
>08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
>Modified at request of Scheduler at r111.testcluster.com
>08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
>Run at request of Scheduler at r111.testcluster.com
>
>then, it looks like the job has run (I know it has because my file
>that I echo >'d to has the info):
>
>08/26/2005 17:32:18;0010;PBS_Server;Job;16.r111.testcluster.com;Exit_status=0
>resources_used.cput=00:00:00 resources_used.mem=524kb
>resources_used.vmem=7360kb resources_used.walltime=00:01:56
>
>But then I get this:
>
>08/26/2005 17:32:40;000d;PBS_Server;Job;16.r111.testcluster.com;Post
>job file processing error; job 16.r111.testcluster.com on host
>n02/1+n02/0+n01/1+n01/0
>
>I've googled this quite a bit and it seems like this is some kind of
>communication error between the nodes and the headnode.  password-less
>ssh and rsh both work from-to all the machines in the cluster.  So I'm
>not sure what exactly is failing to communicate.
>
>The /home is nfs mounted to the slave nodes so I don't think it's any
>kind of permission error..
>
>If I look at the mom_logs from one of the nodes the job ran on,
>everything looks fine:
>
>08/26/2005 14:30:12;0008;   pbs_mom;Job;17.r111.testcluster.com;JOIN
>JOB as node 1
>08/26/2005 14:30:12;0100;  
>pbs_mom;Job;17.r111.testcluster.com;kill_job received
>08/26/2005 14:30:12;0001;   pbs_mom;Svr;pbs_mom;im_eof, End of File
>from addr 192.168.1.2:1023
>
>This WAS working at one point, then a few people started making
>changes and nobody remembers exactly what.  I've rebuilt torque from
>source (1.2.0p5) and the exact same problem persists.
>
>I'm pretty stumped right now, any advice would be wonderful!
>
>thanks, deek
>
>
>here is the script I'm submitting:
>
>#!/bin/sh
>#PBS -l nodes=2:ppn=2
>#PBS -o /home/trent/output-mytorquejob.log
>#PBS -e /home/trent/output-mytorquejob.err
>### Queue name
>#PBS -q default
>### Job name
>#PBS -N pbs_test
>### Declare job-non-rerunable
>#PBS -r n
>
>echo test>/home/trent/trenttest > /home/trent
>
>
>this is my queue info:
>
>#
># Create queues and set their attributes.
>#
>#
># Create and define queue high
>#
>create queue high
>set queue high queue_type = Execution
>set queue high Priority = 100
>set queue high max_running = 50
>set queue high max_user_run = 50
>#set queue high resources_min.cput = 02:00:00
>#set queue high resources_max.cput = 03:00:00
>#set queue high resources_default.cput = 02:00:00
>set queue high enabled = True
>set queue high started = True
>#
># Create and define queue default
>#
>create queue default
>set queue default queue_type = Execution
>set queue default Priority = 75
>set queue default max_running = 50
>set queue default max_user_run = 50
>#set queue default resources_min.cput = 48:00:00
>#set queue default resources_max.cput = 49:00:00
>#set queue default resources_default.cput = 48:00:00
>set queue default enabled = True
>set queue default started = True
>#
># Create and define queue medium
>#
>create queue medium
>set queue medium queue_type = Execution
>set queue medium Priority = 50
>set queue medium max_running = 50
>set queue medium max_user_run = 50
>#set queue medium resources_min.cput = 168:00:00
>#set queue medium resources_max.cput = 169:00:00
>#set queue medium resources_default.cput = 168:00:00
>set queue medium enabled = True
>set queue medium started = True
>#
># Create and define queue low
>#
>create queue low
>set queue low queue_type = Execution
>set queue low Priority = 25
>set queue low max_running = 50
>set queue low max_user_run = 50
>#set queue low resources_min.cput = 336:00:00
>#set queue low resources_max.cput = 337:00:00
>#set queue low resources_default.cput = 336:00:00
>set queue low enabled = True
>set queue low started = True
>#
># Set server attributes.
>#
>set server scheduling = True
>set server node_pack = True
>set server max_running = 50
>set server max_user_run = 50
>set server acl_host_enable = True
>set server acl_hosts = master.r111.testcluster.com
>set server acl_hosts += *.r111.testcluster.com
>set server acl_hosts += localhost.localdomain
>set server managers = mpiadmin at r111.testcluster.com
>set server managers += mpiadmin at r111.testcluster.com
>set server managers += root at r111.testcluster.com
>set server managers += root at localhost.localdomain
>set server default_queue = default
>set server log_events = 127
>set server mail_from = pbsadmin
>set server query_other_jobs = True
>set server resources_default.neednodes = 1
>set server resources_default.nodect = 1
>set server resources_default.nodes = 1
>set server scheduler_iteration = 150
>set server node_ping_rate = 150
>set server node_check_rate = 300
>set server comment = 'Torque Server @ r111.testcluster.com'
>#
># Define cluster nodes
>#
>create node r111 np=2,ntype=time-shared,properties="headnode"
>create node n01 np=2,ntype=cluster,properties="odd"
>create node n02 np=2,ntype=cluster,properties="even"
>create node n03 np=2,ntype=cluster,properties="odd"
>create node n04 np=2,ntype=cluster,properties="even"
>create node n05 np=2,ntype=cluster,properties="odd"
>create node n06 np=2,ntype=cluster,properties="even"
>create node n07 np=2,ntype=cluster,properties="odd"
>
>_______________________________________________
>Beowulf mailing list, Beowulf at beowulf.org
>To change your subscription (digest mode or unsubscribe) visit http://www.beowulf.org/mailman/listinfo/beowulf
>
>  
>




More information about the Beowulf mailing list