Archives


- Beowulf
- Beowulf Announce
- Scyld-users
- Beowulf on Debian

[Beowulf] torque issues...

Many of your questions may have already been answered in earlier discussions or in the FAQ. The search results page will indicate current discussions as well as past mailing-list posts, articles, and papers.

Search

Mike Davis jmdavis at mail2.vcu.edu
Fri Aug 26 16:40:51 PDT 2005


Deek,

It may be trying to scp even though the /home dir that you are writing to
is NFS-mounted. There should be a section in the manual about this; I'm at
home and don't have any docs here.

Mike

decon brody wrote:

>Good day all,
>
>I'm having an issue with (I think) pbs_sched.
>
>I submit a very simple job (script at the end of this mail) all it
>does is, "echo test>/home/trent/trenttest."  I submit it with, "qsub
>pbs_test.sh."  When I watch the queue with qstat -an, I see the job
>enter the status Q, then a second later, it starts to run (R) and
>gets two nodes (like I asked for), then about 2 minutes later, it
>errors (E).
>
>In my script I've sent -o and -e but I never get any output (not my
>biggest concern, but I'm not getting any feedback).  If I look at the
>server_logs, I can see the job being submitted:
>
>08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
>Queued at request of trent at r111.testcluster.com, owner =
>trent at r111.testcluster.com, job name = PBS_MPI_Test, queue = default
>08/26/2005 17:30:21;0040;PBS_Server;Svr;r111.testcluster.com;Scheduler
>sent command new
>08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
>Modified at request of Scheduler at r111.testcluster.com
>08/26/2005 17:30:21;0008;PBS_Server;Job;16.r111.testcluster.com;Job
>Run at request of Scheduler at r111.testcluster.com
>
>then, it looks like the job has run (I know it has because my file
>that I echo >'d to has the info):
>
>08/26/2005 17:32:18;0010;PBS_Server;Job;16.r111.testcluster.com;Exit_status=0
>resources_used.cput=00:00:00 resources_used.mem=524kb
>resources_used.vmem=7360kb resources_used.walltime=00:01:56
>
>But then I get this:
>
>08/26/2005 17:32:40;000d;PBS_Server;Job;16.r111.testcluster.com;Post
>job file processing error; job 16.r111.testcluster.com on host
>n02/1+n02/0+n01/1+n01/0
>
>I've googled this quite a bit and it seems like this is some kind of
>communication error between the nodes and the headnode.  password-less
>ssh and rsh both work from-to all the machines in the cluster.  So I'm
>not sure what exactly is failing to communicate.
>
>The /home is nfs mounted to the slave nodes so I don't think it's any
>kind of permission error..
>
>If I look at the mom_logs from one of the nodes the job ran on,
>everything looks fine:
>
>08/26/2005 14:30:12;0008;   pbs_mom;Job;17.r111.testcluster.com;JOIN
>JOB as node 1
>08/26/2005 14:30:12;0100;  
>pbs_mom;Job;17.r111.testcluster.com;kill_job received
>08/26/2005 14:30:12;0001;   pbs_mom;Svr;pbs_mom;im_eof, End of File
>from addr 192.168.1.2:1023
>
>This WAS working at one point, then a few people started making
>changes and nobody remembers exactly what.  I've rebuilt torque from
>source (1.2.0p5) and the exact same problem persists.
>
>I'm pretty stumped right now, any advice would be wonderful!
>
>thanks, deek
>
>
>here is the script I'm submitting:
>
>#!/bin/sh
>#PBS -l nodes=2:ppn=2
>#PBS -o /home/trent/output-mytorquejob.log
>#PBS -e /home/trent/output-mytorquejob.err
>### Queue name
>#PBS -q default
>### Job name
>#PBS -N pbs_test
>### Declare job-non-rerunable
>#PBS -r n
>
>echo test>/home/trent/trenttest > /home/trent
>
>
>this is my queue info:
>
>#
># Create queues and set their attributes.
>#
>#
># Create and define queue high
>#
>create queue high
>set queue high queue_type = Execution
>set queue high Priority = 100
>set queue high max_running = 50
>set queue high max_user_run = 50
>#set queue high resources_min.cput = 02:00:00
>#set queue high resources_max.cput = 03:00:00
>#set queue high resources_default.cput = 02:00:00
>set queue high enabled = True
>set queue high started = True
>#
># Create and define queue default
>#
>create queue default
>set queue default queue_type = Execution
>set queue default Priority = 75
>set queue default max_running = 50
>set queue default max_user_run = 50
>#set queue default resources_min.cput = 48:00:00
>#set queue default resources_max.cput = 49:00:00
>#set queue default resources_default.cput = 48:00:00
>set queue default enabled = True
>set queue default started = True
>#
># Create and define queue medium
>#
>create queue medium
>set queue medium queue_type = Execution
>set queue medium Priority = 50
>set queue medium max_running = 50
>set queue medium max_user_run = 50
>#set queue medium resources_min.cput = 168:00:00
>#set queue medium resources_max.cput = 169:00:00
>#set queue medium resources_default.cput = 168:00:00
>set queue medium enabled = True
>set queue medium started = True
>#
># Create and define queue low
>#
>create queue low
>set queue low queue_type = Execution
>set queue low Priority = 25
>set queue low max_running = 50
>set queue low max_user_run = 50
>#set queue low resources_min.cput = 336:00:00
>#set queue low resources_max.cput = 337:00:00
>#set queue low resources_default.cput = 336:00:00
>set queue low enabled = True
>set queue low started = True
>#
># Set server attributes.
>#
>set server scheduling = True
>set server node_pack = True
>set server max_running = 50
>set server max_user_run = 50
>set server acl_host_enable = True
>set server acl_hosts = master.r111.testcluster.com
>set server acl_hosts += *.r111.testcluster.com
>set server acl_hosts += localhost.localdomain
>set server managers = mpiadmin at r111.testcluster.com
>set server managers += mpiadmin at r111.testcluster.com
>set server managers += root at r111.testcluster.com
>set server managers += root at localhost.localdomain
>set server default_queue = default
>set server log_events = 127
>set server mail_from = pbsadmin
>set server query_other_jobs = True
>set server resources_default.neednodes = 1
>set server resources_default.nodect = 1
>set server resources_default.nodes = 1
>set server scheduler_iteration = 150
>set server node_ping_rate = 150
>set server node_check_rate = 300
>set server comment = 'Torque Server @ r111.testcluster.com'
>#
># Define cluster nodes
>#
>create node r111 np=2,ntype=time-shared,properties="headnode"
>create node n01 np=2,ntype=cluster,properties="odd"
>create node n02 np=2,ntype=cluster,properties="even"
>create node n03 np=2,ntype=cluster,properties="odd"
>create node n04 np=2,ntype=cluster,properties="even"
>create node n05 np=2,ntype=cluster,properties="odd"
>create node n06 np=2,ntype=cluster,properties="even"
>create node n07 np=2,ntype=cluster,properties="odd"
>
>_______________________________________________
>Beowulf mailing list, Beowulf at beowulf.org
>To change your subscription (digest mode or unsubscribe) visit http://www.beowulf.org/mailman/listinfo/beowulf
>
>  
>




More information about the Beowulf mailing list