FEATURE: PATCH:Eygene Ryabinkin - Fitting job into partition that works.
Problem Ref: [MAUIUSERS] PATCH FOR DEFAULT PARTITION HANDLING
Organization: Eygene
Contact: Eygene Ryabinkin
git-svn-id: svn://opensvn.adaptivecomputing.com/maui/trunk@99 3f5042e3-fb1d-0410-be18-d6ca2573e517
This commit is contained in:
parent
479cb2138a
commit
0d182829d2
@ -292,6 +292,7 @@ int MJobSetState(mjob_t *,enum MJobStateEnum);
|
||||
int MJobPreempt(mjob_t *,mjob_t **,enum MPreemptPolicyEnum,char *,int *);
|
||||
int MJobResume(mjob_t *,char *,int *);
|
||||
int MJobGetPAL(mjob_t *,int *,int *,mpar_t **);
|
||||
mpar_t *MJobFindDefPart(mjob_t *, mclass_t *, int *);
|
||||
int MJobRemove(mjob_t *);
|
||||
int MJobGetAccount(mjob_t *,mgcred_t **);
|
||||
int MJobSetCreds(mjob_t *,char *,char *,char *);
|
||||
@ -387,7 +388,7 @@ int MQueueDiagnose(mjob_t **,int *,int,mpar_t *,char *,int);
|
||||
int MQueueCheckStatus(void);
|
||||
int MQueueGetRequeueValue(int *,long,long,double *);
|
||||
int MQueueSelectAllJobs(mjob_t **,int,mpar_t *,int *,int,int,int,char *);
|
||||
int MQueueSelectJobs(int *,int *,int,int,int,unsigned long,int,int *,mbool_t);
|
||||
int MQueueSelectJobs(int *,int *,int,int,int,unsigned long,int,int *,mbool_t,mbool_t);
|
||||
int MQueueAddAJob(mjob_t *);
|
||||
int MQueueRemoveAJob(mjob_t *,int);
|
||||
int MQueueBackFill(int *,int,mpar_t *);
|
||||
|
108
src/moab/MPar.c
108
src/moab/MPar.c
@ -239,52 +239,11 @@ int MJobGetPAL(
|
||||
if (PAL != NULL)
|
||||
MUBMCopy(PAL,tmpPAL,MAX_MPAR);
|
||||
|
||||
/* determine allowed partition default (precedence: U,G,A,C,S,0) */
|
||||
/* determine allowed partition default */
|
||||
|
||||
if (PDef != NULL)
|
||||
{
|
||||
if ((J->Cred.U->F.PDef != NULL) &&
|
||||
(J->Cred.U->F.PDef != &MPar[0]) &&
|
||||
MUBMCheck(((mpar_t *)J->Cred.U->F.PDef)->Index,tmpPAL))
|
||||
{
|
||||
*PDef = (mpar_t *)J->Cred.U->F.PDef;
|
||||
}
|
||||
else if ((J->Cred.G->F.PDef != NULL) &&
|
||||
(J->Cred.G->F.PDef != &MPar[0]) &&
|
||||
MUBMCheck(((mpar_t *)J->Cred.G->F.PDef)->Index,tmpPAL))
|
||||
{
|
||||
*PDef = (mpar_t *)J->Cred.G->F.PDef;
|
||||
}
|
||||
else if ((J->Cred.A != NULL) &&
|
||||
(J->Cred.A->F.PDef != NULL) &&
|
||||
(J->Cred.A->F.PDef != &MPar[0]) &&
|
||||
MUBMCheck(((mpar_t *)J->Cred.A->F.PDef)->Index,tmpPAL))
|
||||
{
|
||||
*PDef = (mpar_t *)J->Cred.A->F.PDef;
|
||||
}
|
||||
else if ((C != NULL) &&
|
||||
(C->F.PDef != NULL) &&
|
||||
(C->F.PDef != &MPar[0]) &&
|
||||
MUBMCheck(((mpar_t *)C->F.PDef)->Index,tmpPAL))
|
||||
{
|
||||
*PDef = (mpar_t *)C->F.PDef;
|
||||
}
|
||||
else if ((J->Cred.Q != NULL) &&
|
||||
(J->Cred.Q->F.PDef != NULL) &&
|
||||
(J->Cred.Q->F.PDef != &MPar[0]) &&
|
||||
MUBMCheck(((mpar_t *)J->Cred.Q->F.PDef)->Index,tmpPAL))
|
||||
{
|
||||
*PDef = (mpar_t *)J->Cred.Q->F.PDef;
|
||||
}
|
||||
else if ((MPar[0].F.PDef != NULL) &&
|
||||
(MPar[0].F.PDef != &MPar[0]))
|
||||
{
|
||||
*PDef = (mpar_t *)MPar[0].F.PDef;
|
||||
}
|
||||
else
|
||||
{
|
||||
*PDef = &MPar[MDEF_SYSPDEF];
|
||||
}
|
||||
*PDef = MJobFindDefPart(J, C, tmpPAL);
|
||||
|
||||
/* verify access to default partition */
|
||||
|
||||
@ -331,7 +290,70 @@ int MJobGetPAL(
|
||||
return(SUCCESS);
|
||||
} /* END MJobGetPAL() */
|
||||
|
||||
/*
|
||||
* Determines default partition for a job (precedence: U,G,A,C,S,0)
|
||||
* 'PAL' is consulted to determine partition access if it is not NULL.
|
||||
* 'C' is consulted for the default partition if it is not NULL.
|
||||
*/
|
||||
mpar_t *MJobFindDefPart(
|
||||
|
||||
mjob_t *J, /* I: job */
|
||||
mclass_t *C, /* I: job class */
|
||||
int *PAL) /* I: partition access list */
|
||||
|
||||
{
|
||||
mpar_t *PDef;
|
||||
|
||||
if ((J->Cred.U->F.PDef != NULL) &&
|
||||
(J->Cred.U->F.PDef != &MPar[0]) &&
|
||||
(PAL == NULL ||
|
||||
MUBMCheck(((mpar_t *)J->Cred.U->F.PDef)->Index,PAL)))
|
||||
{
|
||||
PDef = (mpar_t *)J->Cred.U->F.PDef;
|
||||
}
|
||||
else if ((J->Cred.G->F.PDef != NULL) &&
|
||||
(J->Cred.G->F.PDef != &MPar[0]) &&
|
||||
(PAL == NULL ||
|
||||
MUBMCheck(((mpar_t *)J->Cred.G->F.PDef)->Index,PAL)))
|
||||
{
|
||||
PDef = (mpar_t *)J->Cred.G->F.PDef;
|
||||
}
|
||||
else if ((J->Cred.A != NULL) &&
|
||||
(J->Cred.A->F.PDef != NULL) &&
|
||||
(J->Cred.A->F.PDef != &MPar[0]) &&
|
||||
(PAL == NULL ||
|
||||
MUBMCheck(((mpar_t *)J->Cred.A->F.PDef)->Index,PAL)))
|
||||
{
|
||||
PDef = (mpar_t *)J->Cred.A->F.PDef;
|
||||
}
|
||||
else if ((C != NULL) &&
|
||||
(C->F.PDef != NULL) &&
|
||||
(C->F.PDef != &MPar[0]) &&
|
||||
(PAL == NULL ||
|
||||
MUBMCheck(((mpar_t *)C->F.PDef)->Index,PAL)))
|
||||
{
|
||||
PDef = (mpar_t *)C->F.PDef;
|
||||
}
|
||||
else if ((J->Cred.Q != NULL) &&
|
||||
(J->Cred.Q->F.PDef != NULL) &&
|
||||
(J->Cred.Q->F.PDef != &MPar[0]) &&
|
||||
(PAL == NULL ||
|
||||
MUBMCheck(((mpar_t *)J->Cred.Q->F.PDef)->Index,PAL)))
|
||||
{
|
||||
PDef = (mpar_t *)J->Cred.Q->F.PDef;
|
||||
}
|
||||
else if ((MPar[0].F.PDef != NULL) &&
|
||||
(MPar[0].F.PDef != &MPar[0]))
|
||||
{
|
||||
PDef = (mpar_t *)MPar[0].F.PDef;
|
||||
}
|
||||
else
|
||||
{
|
||||
PDef = &MPar[MDEF_SYSPDEF];
|
||||
}
|
||||
|
||||
return PDef;
|
||||
} /* END MJobFindDefPart() */
|
||||
|
||||
|
||||
int MParFind(
|
||||
|
@ -39,11 +39,21 @@ extern mres_t *MRes[];
|
||||
|
||||
*/
|
||||
|
||||
static int MQueueCheckSingleJob(
|
||||
|
||||
mjob_t *J,
|
||||
int *Reason,
|
||||
mpar_t *P,
|
||||
mpar_t *GP,
|
||||
int PLevel,
|
||||
int MaxNC,
|
||||
int MaxPC,
|
||||
unsigned long MaxWCLimit,
|
||||
int OrigPIndex,
|
||||
mbool_t UpdateStats);
|
||||
|
||||
/* NYI: must handle effqduration */
|
||||
|
||||
|
||||
|
||||
|
||||
int MQueueSelectJobs(
|
||||
|
||||
int *SrcQ, /* I */
|
||||
@ -54,7 +64,8 @@ int MQueueSelectJobs(
|
||||
unsigned long MaxWCLimit, /* I */
|
||||
int OrigPIndex, /* I */
|
||||
int *FReason, /* O */
|
||||
mbool_t UpdateStats) /* I: (boolean) */
|
||||
mbool_t UpdateStats, /* I: (boolean) */
|
||||
mbool_t OnlyDefPart) /* I: (boolean) */
|
||||
|
||||
{
|
||||
int index;
|
||||
@ -63,27 +74,14 @@ int MQueueSelectJobs(
|
||||
|
||||
mjob_t *J;
|
||||
|
||||
char DValue[MAX_MNAME];
|
||||
enum MJobDependEnum DType;
|
||||
|
||||
mpar_t *P;
|
||||
mpar_t *GP;
|
||||
|
||||
long PS;
|
||||
|
||||
int LReason[MAX_MREJREASON];
|
||||
int PReason;
|
||||
|
||||
int *Reason;
|
||||
|
||||
int PIndex;
|
||||
int PReq;
|
||||
|
||||
mreq_t *RQ;
|
||||
|
||||
double PE;
|
||||
|
||||
char tmpLine[MAX_MLINE];
|
||||
|
||||
const char *FName = "MQueueSelectJobs";
|
||||
|
||||
@ -159,324 +157,28 @@ int MQueueSelectJobs(
|
||||
continue;
|
||||
}
|
||||
|
||||
RQ = J->Req[0]; /* FIXME */
|
||||
|
||||
/* if job removed */
|
||||
|
||||
if (J->Name[0] == '\0')
|
||||
if (OnlyDefPart == TRUE && MJobFindDefPart(J, NULL, NULL) != P)
|
||||
{
|
||||
Reason[marCorruption]++;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (UpdateStats == TRUE)
|
||||
{
|
||||
J->BlockReason = 0;
|
||||
|
||||
if (J->State == mjsIdle)
|
||||
MStat.IdleJobs++;
|
||||
}
|
||||
|
||||
PReq = MJobGetProcCount(J);
|
||||
MJobGetPE(J,P,&PE);
|
||||
PS = (long)PReq * J->SpecWCLimit[0];
|
||||
|
||||
/* check partition */
|
||||
|
||||
if (OrigPIndex != -1)
|
||||
{
|
||||
if ((P->Index == 0) && !(J->Flags & (1 << mjfSpan)))
|
||||
{
|
||||
/* why? what does partition '0' mean in partition mode? */
|
||||
|
||||
DBG(3,fSCHED) DPrint("INFO: job %s not considered for spanning\n",
|
||||
J->Name);
|
||||
|
||||
Reason[marPartitionAccess]++;
|
||||
|
||||
continue;
|
||||
}
|
||||
else if ((P->Index != 0) && (J->Flags & (1 << mjfSpan)))
|
||||
{
|
||||
DBG(3,fSCHED) DPrint("INFO: spanning job %s not considered for partition scheduling\n",
|
||||
J->Name);
|
||||
|
||||
Reason[marPartitionAccess]++;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((P->Index > 0) && (MUBMCheck(P->Index,J->PAL) == FAILURE))
|
||||
{
|
||||
DBG(7,fSCHED) DPrint("INFO: job %s not considered for partition %s (allowed %s)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
MUListAttrs(ePartition,J->PAL[0]));
|
||||
|
||||
Reason[marPartitionAccess]++;
|
||||
|
||||
continue;
|
||||
}
|
||||
} /* END if (OrigPIndex != -1) */
|
||||
|
||||
/* check job state */
|
||||
|
||||
if ((J->State != mjsIdle) && (J->State != mjsSuspended))
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected (job in non-idle state '%s')\n",
|
||||
DBG(7,fSCHED) DPrint("INFO: skipping job[%d] '%s', only default partition check requested (and current partition is %s)\n",
|
||||
jindex,
|
||||
J->Name,
|
||||
MJobState[J->State]);
|
||||
|
||||
Reason[marState]++;
|
||||
|
||||
if ((MaxNC == MAX_MNODE) &&
|
||||
(MaxWCLimit == MAX_MTIME) &&
|
||||
(J->R != NULL))
|
||||
{
|
||||
if ((J->State != mjsStarting) && (J->State != mjsRunning))
|
||||
MResDestroy(&J->R);
|
||||
}
|
||||
P->Name);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check if job has been previously scheduled or deferred */
|
||||
|
||||
if ((J->EState != mjsIdle) && (J->EState != mjsSuspended))
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected (job in non-idle expected state: '%s')\n",
|
||||
J->Name,
|
||||
MJobState[J->EState]);
|
||||
|
||||
Reason[marEState]++;
|
||||
|
||||
if ((MaxNC == MAX_MNODE) && (MaxWCLimit == MAX_MTIME) && (J->R != NULL))
|
||||
{
|
||||
if ((J->EState != mjsStarting) && (J->EState != mjsRunning))
|
||||
MResDestroy(&J->R);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check available procs */
|
||||
|
||||
if (PReq > P->CRes.Procs)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected in partition %s (exceeds configured procs: %d > %d)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
PReq,
|
||||
P->CRes.Procs);
|
||||
|
||||
Reason[marNodeCount]++;
|
||||
|
||||
if (P->Index <= 0)
|
||||
{
|
||||
if (J->R != NULL)
|
||||
MResDestroy(&J->R);
|
||||
|
||||
if (J->Hold == 0)
|
||||
{
|
||||
MJobSetHold(
|
||||
J,
|
||||
(1 << mhDefer),
|
||||
MSched.DeferTime,
|
||||
mhrNoResources,
|
||||
"exceeds partition configured procs");
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check partition specific limits */
|
||||
|
||||
if (MJobCheckLimits(
|
||||
J,
|
||||
if (MQueueCheckSingleJob(
|
||||
J,
|
||||
Reason,
|
||||
P,
|
||||
GP,
|
||||
PLevel,
|
||||
P,
|
||||
(1 << mlSystem),
|
||||
tmpLine) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected, partition %s (%s)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
tmpLine);
|
||||
|
||||
Reason[marSystemLimits]++;
|
||||
|
||||
if (P->Index <= 0)
|
||||
{
|
||||
if (J->R != NULL)
|
||||
MResDestroy(&J->R);
|
||||
|
||||
MJobSetHold(
|
||||
J,
|
||||
(1 << mhDefer),
|
||||
MSched.DeferTime,
|
||||
mhrSystemLimits,
|
||||
"exceeds system proc/job limit");
|
||||
}
|
||||
|
||||
MaxNC,
|
||||
MaxPC,
|
||||
MaxWCLimit,
|
||||
OrigPIndex,
|
||||
UpdateStats) == FAILURE)
|
||||
continue;
|
||||
} /* END if (MJobCheckLimits() == FAILURE) */
|
||||
|
||||
/* check job size */
|
||||
|
||||
if (PReq > MaxPC)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected in partition %s (exceeds window size: %d > %d)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
PReq,
|
||||
MaxPC);
|
||||
|
||||
Reason[marNodeCount]++;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check job duration */
|
||||
|
||||
if (J->SpecWCLimit[0] > MaxWCLimit)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected in partition %s (exceeds window time: %ld > %ld)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
J->SpecWCLimit[0],
|
||||
MaxWCLimit);
|
||||
|
||||
Reason[marTime]++;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check partition class support */
|
||||
|
||||
if (P->Index > 0)
|
||||
{
|
||||
if (MUNumListGetCount(J->StartPriority,RQ->DRes.PSlot,P->CRes.PSlot,0,NULL) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected, partition %s (classes not supported '%s')\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
MUCAListToString(RQ->DRes.PSlot,P->CRes.PSlot,NULL));
|
||||
|
||||
Reason[marClass]++;
|
||||
|
||||
if (J->R != NULL)
|
||||
MResDestroy(&J->R);
|
||||
|
||||
continue;
|
||||
}
|
||||
} /* END if (PIndex) */
|
||||
|
||||
if (MJobCheckDependency(J,&DType,DValue) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected (dependent on job '%s' %s)\n",
|
||||
J->Name,
|
||||
DValue,
|
||||
MJobDependType[DType]);
|
||||
|
||||
if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
|
||||
{
|
||||
J->SystemQueueTime = MSched.Time;
|
||||
}
|
||||
|
||||
Reason[marDepend]++;
|
||||
|
||||
if ((MaxNC == MAX_MNODE) &&
|
||||
(MaxWCLimit == MAX_MTIME) &&
|
||||
(J->R != NULL))
|
||||
{
|
||||
MResDestroy(&J->R);
|
||||
}
|
||||
|
||||
continue;
|
||||
} /* END if (MJobCheckDependency(J,&JDepend) == FAILURE) */
|
||||
|
||||
/* check partition active job policies */
|
||||
|
||||
if (MJobCheckPolicies(
|
||||
J,
|
||||
PLevel,
|
||||
(1 << mlActive),
|
||||
P, /* NOTE: may set to &MPar[0] */
|
||||
&PReason,
|
||||
NULL,
|
||||
MAX_MTIME) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected, partition %s (policy failure: '%s')\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
MPolicyRejection[PReason]);
|
||||
|
||||
if (PLevel == ptHARD)
|
||||
{
|
||||
if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
|
||||
{
|
||||
J->SystemQueueTime = MSched.Time;
|
||||
}
|
||||
}
|
||||
|
||||
Reason[marPolicy]++;
|
||||
|
||||
if ((MaxNC == MAX_MNODE) &&
|
||||
(MaxWCLimit == MAX_MTIME) &&
|
||||
(J->R != NULL))
|
||||
{
|
||||
MResDestroy(&J->R);
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
J->Cred.U->MTime = MSched.Time;
|
||||
J->Cred.G->MTime = MSched.Time;
|
||||
|
||||
if (J->Cred.A != NULL)
|
||||
J->Cred.A->MTime = MSched.Time;
|
||||
|
||||
if (MPar[0].FSC.FSPolicy != fspNONE)
|
||||
{
|
||||
int OIndex;
|
||||
|
||||
if (MFSCheckCap(NULL,J,P,&OIndex) == FAILURE)
|
||||
{
|
||||
DBG(5,fSCHED) DPrint("INFO: job '%s' exceeds %s FS cap\n",
|
||||
J->Name,
|
||||
(OIndex > 0) ? MXO[OIndex] : "NONE");
|
||||
|
||||
if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
|
||||
{
|
||||
J->SystemQueueTime = MSched.Time;
|
||||
}
|
||||
|
||||
Reason[marFairShare]++;
|
||||
|
||||
continue;
|
||||
}
|
||||
} /* END if (FS[0].FSPolicy != fspNONE) */
|
||||
|
||||
/* NOTE: idle queue policies handled in MQueueSelectAllJobs() */
|
||||
|
||||
if (MLocalCheckFairnessPolicy(J,MSched.Time,NULL) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected, partition %s (violates local fairness policy)\n",
|
||||
J->Name,
|
||||
P->Name);
|
||||
|
||||
if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
|
||||
{
|
||||
J->SystemQueueTime = MSched.Time;
|
||||
}
|
||||
|
||||
Reason[marPolicy]++;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/* NOTE: effective queue duration not yet properly supported */
|
||||
|
||||
@ -522,6 +224,363 @@ int MQueueSelectJobs(
|
||||
return(SUCCESS);
|
||||
} /* END MQueueSelectJobs() */
|
||||
|
||||
/*
|
||||
* Helper for MQueueSelectJobs: performs the single job evaluation.
|
||||
* Returns SUCCESS if job can be queued and FAILURE otherwise.
|
||||
*/
|
||||
static int MQueueCheckSingleJob(
|
||||
mjob_t *J,
|
||||
int *Reason,
|
||||
mpar_t *P,
|
||||
mpar_t *GP,
|
||||
int PLevel,
|
||||
int MaxNC,
|
||||
int MaxPC,
|
||||
unsigned long MaxWCLimit,
|
||||
int OrigPIndex,
|
||||
mbool_t UpdateStats)
|
||||
|
||||
{
|
||||
char DValue[MAX_MNAME];
|
||||
enum MJobDependEnum DType;
|
||||
|
||||
long PS;
|
||||
|
||||
int PReason;
|
||||
|
||||
int PReq;
|
||||
|
||||
mreq_t *RQ;
|
||||
|
||||
double PE;
|
||||
|
||||
char tmpLine[MAX_MLINE];
|
||||
|
||||
const char *FName = "MQueueCheckSingleJob";
|
||||
|
||||
RQ = J->Req[0]; /* FIXME */
|
||||
|
||||
/* if job removed */
|
||||
|
||||
if (J->Name[0] == '\0')
|
||||
{
|
||||
Reason[marCorruption]++;
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
if (UpdateStats == TRUE)
|
||||
{
|
||||
J->BlockReason = 0;
|
||||
|
||||
if (J->State == mjsIdle)
|
||||
MStat.IdleJobs++;
|
||||
}
|
||||
|
||||
PReq = MJobGetProcCount(J);
|
||||
/* XXX: PE is unused? */
|
||||
MJobGetPE(J,P,&PE);
|
||||
PS = (long)PReq * J->SpecWCLimit[0];
|
||||
|
||||
/* check partition */
|
||||
|
||||
if (OrigPIndex != -1)
|
||||
{
|
||||
if ((P->Index == 0) && !(J->Flags & (1 << mjfSpan)))
|
||||
{
|
||||
/* why? what does partition '0' mean in partition mode? */
|
||||
|
||||
DBG(3,fSCHED) DPrint("INFO: job %s not considered for spanning\n",
|
||||
J->Name);
|
||||
|
||||
Reason[marPartitionAccess]++;
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
else if ((P->Index != 0) && (J->Flags & (1 << mjfSpan)))
|
||||
{
|
||||
DBG(3,fSCHED) DPrint("INFO: spanning job %s not considered for partition scheduling\n",
|
||||
J->Name);
|
||||
|
||||
Reason[marPartitionAccess]++;
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
if ((P->Index > 0) && (MUBMCheck(P->Index,J->PAL) == FAILURE))
|
||||
{
|
||||
DBG(7,fSCHED) DPrint("INFO: job %s not considered for partition %s (allowed %s)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
MUListAttrs(ePartition,J->PAL[0]));
|
||||
|
||||
Reason[marPartitionAccess]++;
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
} /* END if (OrigPIndex != -1) */
|
||||
|
||||
/* check job state */
|
||||
|
||||
if ((J->State != mjsIdle) && (J->State != mjsSuspended))
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected (job in non-idle state '%s')\n",
|
||||
J->Name,
|
||||
MJobState[J->State]);
|
||||
|
||||
Reason[marState]++;
|
||||
|
||||
if ((MaxNC == MAX_MNODE) &&
|
||||
(MaxWCLimit == MAX_MTIME) &&
|
||||
(J->R != NULL))
|
||||
{
|
||||
if ((J->State != mjsStarting) && (J->State != mjsRunning))
|
||||
MResDestroy(&J->R);
|
||||
}
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
/* check if job has been previously scheduled or deferred */
|
||||
|
||||
if ((J->EState != mjsIdle) && (J->EState != mjsSuspended))
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected (job in non-idle expected state: '%s')\n",
|
||||
J->Name,
|
||||
MJobState[J->EState]);
|
||||
|
||||
Reason[marEState]++;
|
||||
|
||||
if ((MaxNC == MAX_MNODE) && (MaxWCLimit == MAX_MTIME) && (J->R != NULL))
|
||||
{
|
||||
if ((J->EState != mjsStarting) && (J->EState != mjsRunning))
|
||||
MResDestroy(&J->R);
|
||||
}
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
/* check available procs */
|
||||
|
||||
if (PReq > P->CRes.Procs)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected in partition %s (exceeds configured procs: %d > %d)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
PReq,
|
||||
P->CRes.Procs);
|
||||
|
||||
Reason[marNodeCount]++;
|
||||
|
||||
if (P->Index <= 0)
|
||||
{
|
||||
if (J->R != NULL)
|
||||
MResDestroy(&J->R);
|
||||
|
||||
if (J->Hold == 0)
|
||||
{
|
||||
MJobSetHold(
|
||||
J,
|
||||
(1 << mhDefer),
|
||||
MSched.DeferTime,
|
||||
mhrNoResources,
|
||||
"exceeds partition configured procs");
|
||||
}
|
||||
}
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
/* check partition specific limits */
|
||||
|
||||
if (MJobCheckLimits(
|
||||
J,
|
||||
PLevel,
|
||||
P,
|
||||
(1 << mlSystem),
|
||||
tmpLine) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected, partition %s (%s)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
tmpLine);
|
||||
|
||||
Reason[marSystemLimits]++;
|
||||
|
||||
if (P->Index <= 0)
|
||||
{
|
||||
if (J->R != NULL)
|
||||
MResDestroy(&J->R);
|
||||
|
||||
MJobSetHold(
|
||||
J,
|
||||
(1 << mhDefer),
|
||||
MSched.DeferTime,
|
||||
mhrSystemLimits,
|
||||
"exceeds system proc/job limit");
|
||||
}
|
||||
|
||||
return(FAILURE);
|
||||
} /* END if (MJobCheckLimits() == FAILURE) */
|
||||
|
||||
/* check job size */
|
||||
|
||||
if (PReq > MaxPC)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected in partition %s (exceeds window size: %d > %d)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
PReq,
|
||||
MaxPC);
|
||||
|
||||
Reason[marNodeCount]++;
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
/* check job duration */
|
||||
|
||||
if (J->SpecWCLimit[0] > MaxWCLimit)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected in partition %s (exceeds window time: %ld > %ld)\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
J->SpecWCLimit[0],
|
||||
MaxWCLimit);
|
||||
|
||||
Reason[marTime]++;
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
/* check partition class support */
|
||||
|
||||
if (P->Index > 0)
|
||||
{
|
||||
if (MUNumListGetCount(J->StartPriority,RQ->DRes.PSlot,P->CRes.PSlot,0,NULL) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected, partition %s (classes not supported '%s')\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
MUCAListToString(RQ->DRes.PSlot,P->CRes.PSlot,NULL));
|
||||
|
||||
Reason[marClass]++;
|
||||
|
||||
if (J->R != NULL)
|
||||
MResDestroy(&J->R);
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
} /* END if (PIndex) */
|
||||
|
||||
if (MJobCheckDependency(J,&DType,DValue) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected (dependent on job '%s' %s)\n",
|
||||
J->Name,
|
||||
DValue,
|
||||
MJobDependType[DType]);
|
||||
|
||||
if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
|
||||
{
|
||||
J->SystemQueueTime = MSched.Time;
|
||||
}
|
||||
|
||||
Reason[marDepend]++;
|
||||
|
||||
if ((MaxNC == MAX_MNODE) &&
|
||||
(MaxWCLimit == MAX_MTIME) &&
|
||||
(J->R != NULL))
|
||||
{
|
||||
MResDestroy(&J->R);
|
||||
}
|
||||
|
||||
return(FAILURE);
|
||||
} /* END if (MJobCheckDependency(J,&JDepend) == FAILURE) */
|
||||
|
||||
/* check partition active job policies */
|
||||
|
||||
if (MJobCheckPolicies(
|
||||
J,
|
||||
PLevel,
|
||||
(1 << mlActive),
|
||||
P, /* NOTE: may set to &MPar[0] */
|
||||
&PReason,
|
||||
NULL,
|
||||
MAX_MTIME) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected, partition %s (policy failure: '%s')\n",
|
||||
J->Name,
|
||||
P->Name,
|
||||
MPolicyRejection[PReason]);
|
||||
|
||||
if (PLevel == ptHARD)
|
||||
{
|
||||
if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
|
||||
{
|
||||
J->SystemQueueTime = MSched.Time;
|
||||
}
|
||||
}
|
||||
|
||||
Reason[marPolicy]++;
|
||||
|
||||
if ((MaxNC == MAX_MNODE) &&
|
||||
(MaxWCLimit == MAX_MTIME) &&
|
||||
(J->R != NULL))
|
||||
{
|
||||
MResDestroy(&J->R);
|
||||
}
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
J->Cred.U->MTime = MSched.Time;
|
||||
J->Cred.G->MTime = MSched.Time;
|
||||
|
||||
if (J->Cred.A != NULL)
|
||||
J->Cred.A->MTime = MSched.Time;
|
||||
|
||||
if (MPar[0].FSC.FSPolicy != fspNONE)
|
||||
{
|
||||
int OIndex;
|
||||
|
||||
if (MFSCheckCap(NULL,J,P,&OIndex) == FAILURE)
|
||||
{
|
||||
DBG(5,fSCHED) DPrint("INFO: job '%s' exceeds %s FS cap\n",
|
||||
J->Name,
|
||||
(OIndex > 0) ? MXO[OIndex] : "NONE");
|
||||
|
||||
if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
|
||||
{
|
||||
J->SystemQueueTime = MSched.Time;
|
||||
}
|
||||
|
||||
Reason[marFairShare]++;
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
} /* END if (FS[0].FSPolicy != fspNONE) */
|
||||
|
||||
/* NOTE: idle queue policies handled in MQueueSelectAllJobs() */
|
||||
|
||||
if (MLocalCheckFairnessPolicy(J,MSched.Time,NULL) == FAILURE)
|
||||
{
|
||||
DBG(6,fSCHED) DPrint("INFO: job %s rejected, partition %s (violates local fairness policy)\n",
|
||||
J->Name,
|
||||
P->Name);
|
||||
|
||||
if (GP->JobPrioAccrualPolicy == jpapFullPolicy)
|
||||
{
|
||||
J->SystemQueueTime = MSched.Time;
|
||||
}
|
||||
|
||||
Reason[marPolicy]++;
|
||||
|
||||
return(FAILURE);
|
||||
}
|
||||
|
||||
return(SUCCESS);
|
||||
} /* END MQueueCheckSingleJob() */
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -338,6 +338,7 @@ int MQueueBackFill(
|
||||
AdjBFTime,
|
||||
P->Index,
|
||||
NULL,
|
||||
FALSE,
|
||||
FALSE) == FAILURE)
|
||||
{
|
||||
DBG(5,fSCHED) DPrint("INFO: no jobs meet BF window criteria in partition %s\n",
|
||||
@ -1408,6 +1409,7 @@ int MQueueCheckStatus()
|
||||
MAX_MTIME,
|
||||
-1,
|
||||
ReasonList,
|
||||
FALSE,
|
||||
FALSE) == FAILURE)
|
||||
{
|
||||
strcpy(DeferMessage,"SCHED_INFO: job cannot run. Reason: cannot select job\n");
|
||||
|
@ -6737,6 +6737,52 @@ int MJobDistributeTasks(
|
||||
} /* END MJobDistributeTasks() */
|
||||
|
||||
|
||||
/* Helper routine for MSchedProcessJobs() */
|
||||
static void m_schedule_on_partitions(
|
||||
|
||||
int OnlyDefPart, /* I */
|
||||
int DoBackfill, /* I */
|
||||
int *CurrentQ) /* I */
|
||||
|
||||
{
|
||||
int PIndex;
|
||||
int tmpQ[MAX_MJOB];
|
||||
|
||||
for (PIndex = 0;PIndex < MAX_MPAR;PIndex++)
|
||||
{
|
||||
if (((PIndex == 0) && (MPar[2].ConfigNodes == 0)) ||
|
||||
(MPar[PIndex].ConfigNodes == 0))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
MOQueueInitialize(tmpQ);
|
||||
|
||||
if (MQueueSelectJobs(
|
||||
CurrentQ,
|
||||
tmpQ,
|
||||
ptSOFT,
|
||||
MAX_MNODE,
|
||||
MAX_MTASK,
|
||||
MAX_MTIME,
|
||||
PIndex,
|
||||
NULL,
|
||||
TRUE,
|
||||
OnlyDefPart) == SUCCESS)
|
||||
{
|
||||
MQueueScheduleIJobs(tmpQ,&MPar[PIndex]);
|
||||
|
||||
if (DoBackfill == TRUE && MPar[PIndex].BFPolicy != ptOFF)
|
||||
{
|
||||
/* backfill jobs using 'soft' policy constraints */
|
||||
|
||||
MQueueBackFill(tmpQ,ptSOFT,&MPar[PIndex]);
|
||||
}
|
||||
}
|
||||
|
||||
MOQueueDestroy(tmpQ,FALSE);
|
||||
} /* END for (PIndex) */ \
|
||||
} /* END m_schedule_on_partitions() */
|
||||
|
||||
|
||||
int MSchedProcessJobs(
|
||||
@ -6841,6 +6887,7 @@ int MSchedProcessJobs(
|
||||
MAX_MTIME,
|
||||
-1,
|
||||
NULL,
|
||||
FALSE,
|
||||
FALSE) == SUCCESS)
|
||||
{
|
||||
memcpy(MFQ,tmpQ,sizeof(MFQ));
|
||||
@ -6863,45 +6910,20 @@ int MSchedProcessJobs(
|
||||
MAX_MTIME,
|
||||
-1,
|
||||
NULL,
|
||||
TRUE);
|
||||
TRUE,
|
||||
FALSE);
|
||||
|
||||
/* schedule priority jobs */
|
||||
|
||||
if (CurrentQ[0] != -1)
|
||||
{
|
||||
for (PIndex = 0;PIndex < MAX_MPAR;PIndex++)
|
||||
{
|
||||
if (((PIndex == 0) && (MPar[2].ConfigNodes == 0)) ||
|
||||
(MPar[PIndex].ConfigNodes == 0))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
/* schedule jobs on their default partitions; skip backfilling */
|
||||
|
||||
MOQueueInitialize(tmpQ);
|
||||
m_schedule_on_partitions(TRUE, FALSE, CurrentQ);
|
||||
|
||||
if (MQueueSelectJobs(
|
||||
CurrentQ,
|
||||
tmpQ,
|
||||
ptSOFT,
|
||||
MAX_MNODE,
|
||||
MAX_MTASK,
|
||||
MAX_MTIME,
|
||||
PIndex,
|
||||
NULL,
|
||||
TRUE) == SUCCESS)
|
||||
{
|
||||
MQueueScheduleIJobs(tmpQ,&MPar[PIndex]);
|
||||
/* schedule jobs on all partitions; do backfilling */
|
||||
|
||||
if (MPar[PIndex].BFPolicy != ptOFF)
|
||||
{
|
||||
/* backfill jobs using 'soft' policy constraints */
|
||||
|
||||
MQueueBackFill(tmpQ,ptSOFT,&MPar[PIndex]);
|
||||
}
|
||||
}
|
||||
|
||||
MOQueueDestroy(tmpQ,FALSE);
|
||||
} /* END for (PIndex) */
|
||||
m_schedule_on_partitions(FALSE, TRUE, CurrentQ);
|
||||
} /* END if (GlobalSQ[0] != -1) */
|
||||
|
||||
MOQueueDestroy(CurrentQ,TRUE);
|
||||
@ -6915,7 +6937,8 @@ int MSchedProcessJobs(
|
||||
MAX_MTIME,
|
||||
-1,
|
||||
NULL,
|
||||
TRUE);
|
||||
TRUE,
|
||||
FALSE);
|
||||
|
||||
if (CurrentQ[0] != -1)
|
||||
{
|
||||
@ -6947,7 +6970,8 @@ int MSchedProcessJobs(
|
||||
MAX_MTIME,
|
||||
PIndex,
|
||||
NULL,
|
||||
TRUE) == SUCCESS)
|
||||
TRUE,
|
||||
FALSE) == SUCCESS)
|
||||
{
|
||||
MQueueBackFill(tmpQ,ptHARD,&MPar[PIndex]);
|
||||
}
|
||||
@ -6989,7 +7013,8 @@ int MSchedProcessJobs(
|
||||
MAX_MTIME,
|
||||
-1,
|
||||
NULL,
|
||||
TRUE);
|
||||
TRUE,
|
||||
FALSE);
|
||||
|
||||
/* must sort/order MUIQ */
|
||||
|
||||
|
@ -1775,6 +1775,7 @@ int UIJobShow(
|
||||
MAX_MTIME,
|
||||
P->Index,
|
||||
Reason,
|
||||
FALSE,
|
||||
FALSE) == FAILURE) || (DstQ[0] == -1))
|
||||
{
|
||||
for (index = 0;index < MAX_MREJREASON;index++)
|
||||
|
Loading…
Reference in New Issue
Block a user