Commit 567f523d authored by akhuziy's avatar akhuziy
Browse files

edited the rules for rcm

parent 6a3d07a4
......@@ -29,10 +29,15 @@ class Attributes:
cpu_usage = 0
cpu_usage_sum = 0
alloc_cu_sum = 0
cpu_requested = Aggr.job.requested_cu
for node in Aggr.nodes:
thr_cnt = node.virt_thr_core + node.phys_thr_core
cpu_usage_sum += node.proc.cpu_usage
alloc_cu_sum += node.alloc_cu / thr_cnt
cpu_usage = cpu_usage_sum / cpu_requested
......@@ -40,10 +45,10 @@ class Attributes:
format(cpu_usage, cpu_usage_sum, cpu_requested))
res = "NORM"
if cpu_usage < 50 and node.alloc_cu <= 8:
if cpu_usage < 50 and alloc_cu_sum <= 8:
res = "LOW"
break
elif cpu_usage < 80 and node.alloc_cu > 8:
elif cpu_usage < 80 and alloc_cu_sum > 8:
res = "LOW"
break
......@@ -78,36 +83,54 @@ class Attributes:
@rcmattr
def node_cpu_usage_max(Aggr):
res = "NORM"
res = ["ZERO", "LOW", "NORM", "HIGH"]
max_res = res.index("ZERO")
for node in Aggr.nodes:
node_proc_ratio = node.proc.cpu_usage / node.alloc_cu
if node_proc_ratio < 50 and node.alloc_cu <= 8:
res = "LOW"
break
thr_cnt = node.virt_thr_core + node.phys_thr_core
node_proc_ratio = node.proc.cpu_usage / (node.alloc_cu / thr_cnt)
node_res = 0
if node_proc_ratio == 0:
node_res = res.index("ZERO")
elif node_proc_ratio > 100:
# Usage above 100% rates the node HIGH; the rank must land in node_res,
# because node_res is what gets folded into max_res after the chain.
node_res = res.index("HIGH")
elif node_proc_ratio < 50 and node.alloc_cu <= 8:
node_res = res.index("LOW")
elif node_proc_ratio < 80 and node.alloc_cu > 8:
res = "LOW"
break
node_res = res.index("LOW")
elif node_proc_ratio > 50 and node.alloc_cu <= 8:
node_res = res.index("NORM")
elif node_proc_ratio > 80 and node.alloc_cu > 8:
node_res = res.index("NORM")
return res
max_res = max(node_res, max_res)
@rcmattr
def min_proc_ratio(Aggr):
min_proc_ratio = 1
return res[max_res]
def node_cpu_usage_min(Aggr):
res = ["ZERO", "LOW", "NORM", "HIGH"]
min_res = res.index("HIGH")
for node in Aggr.nodes:
node_proc_ratio = (node.proc.cpu_usage / 100) / node.alloc_cu
min_proc_ratio = min(min_proc_ratio, node_proc_ratio)
thr_cnt = node.virt_thr_core + node.phys_thr_core
node_proc_ratio = node.proc.cpu_usage / (node.alloc_cu / thr_cnt)
node_res = 0
if node_proc_ratio == 0:
node_res = res.index("ZERO")
elif node_proc_ratio > 100:
# Usage above 100% rates the node HIGH; the rank must land in node_res,
# because node_res is what gets folded into the running minimum.
node_res = res.index("HIGH")
elif node_proc_ratio < 50 and node.alloc_cu <= 8:
node_res = res.index("LOW")
elif node_proc_ratio < 80 and node.alloc_cu > 8:
node_res = res.index("LOW")
elif node_proc_ratio > 50 and node.alloc_cu <= 8:
node_res = res.index("NORM")
elif node_proc_ratio > 80 and node.alloc_cu > 8:
node_res = res.index("NORM")
printd("min ratio = {}".format(min_proc_ratio))
res = "MED"
if min_proc_ratio > 1.1:
res = "HIGH"
elif min_proc_ratio == 0:
res = "ZERO"
elif min_proc_ratio < 0.9:
res = "LOW"
# Keep the lowest rank seen so far; min_res starts at HIGH and is the
# only accumulator this function defines.
min_res = min(node_res, min_res)
return res
return res[min_res]
@rcmattr
def req_walltime(Aggr):
......
......@@ -33,23 +33,65 @@ The attribute for the requested walltime.
## node_cpu_usage_max
The attribute for the maximum `cpu_usage`.
The attribute for the maximum `cpu_usage` among the nodes.
The optimal CPU usage depends on the number of cores requested. When the number of requested cores is not very high, `cpu_usage` does not have to be near 100%, since the program might not be CPU bound (as soon as we can check this, we can make the attribute more elaborate). But as the number of requested CPUs gets higher, they should be used more efficiently.
**Values**:
`HIGH` - The maximum `cpu_usage` is greater than 100%
`NORM` - The maximum `cpu_usage` is in the tolerable range
`LOW` - The maximum `cpu_usage` is lower than expected
`ZERO` - The maximum `cpu_usage` is 0
**Calculation**:
`U` - maximum sum of `cpu_usage` of all processes on a single node, divided by the number of requested cores.
`U` - the sum of `cpu_usage` of all processes on a single node, divided by the number of requested cores.
`R` - requested amount of cores on a single node.
`LOW` = `U < 50` & `R <= 8` | `U < 80` & `R > 8`
`HIGH` = if such node exists, that `U > 100`
`NORM` = !`HIGH` & if such node exists, that `U > 50` & `R <= 8` | `U > 80` & `R > 8` holds
`LOW` = !`NORM` & if such node exists, that `U < 50` & `R <= 8` | `U < 80` & `R > 8` holds
`ZERO` = !`LOW` & !`NORM` & !`HIGH`
![node_cpu_usage_max](img/node_cpu_usage_max.svg)
*The `LOW` values are highlighted*
## node_cpu_usage_min
The attribute for the minimum `cpu_usage` among the nodes.
Similar to the attribute `node_cpu_usage_max` but indicates the minimum of `cpu_usage` on the node.
**Values**:
`HIGH` - The minimum `cpu_usage` is greater than 100%
`NORM` - The minimum `cpu_usage` is in the tolerable range
`LOW` - The minimum `cpu_usage` is lower than expected
`ZERO` - The minimum `cpu_usage` is 0
**Calculation**:
`U` - the sum of `cpu_usage` of all processes on a single node, divided by the number of requested cores.
`R` - requested amount of cores on a single node.
`ZERO` = if such node exists, that `U = 0`
`LOW` = !`ZERO` & if such node exists, that `U < 50` & `R <= 8` | `U < 80` & `R > 8` holds
`NORM` = !`ZERO` & !`LOW` & such node exists, that `U > 50` & `R <= 8` | `U > 80` & `R > 8` holds
`HIGH` = !`ZERO` & !`LOW` & !`NORM`
![node_cpu_usage_max](img/node_cpu_usage_max.svg)
......@@ -90,3 +132,41 @@ It simply divides the job into 2 categories: running on a single node and runnin
`ONE` - if the number of nodes equals 1
`MULT` - if the number of nodes is greater than 1
## mem_usage_total
The attribute for the ratio between sum of maximum memory usage per node and total requested memory.
**Values**:
`HIGH` - The maximum memory usage is in the tolerable range
`LOW` - The maximum memory usage is lower than the tolerable bound
**Calculation**:
`U` - the ratio of the global sum of maximum sums of memory usage (RSS) of all processes on the nodes to requested memory `R`.
`R` - the sum of requested amount of memory on every node in GiB.
`LOW` = `U < 30` & `R > 32`
`HIGH` = !`LOW`
![mem_usage_total](img/mem_usage_total.svg)
## overloaded_node_exists
It is `True` if any node has a high load during the interval `D`.
**Values**:
`True` - if such node exists
`False` - otherwise
**Calculation**:
If, during the interval `D` (300 seconds), consecutive measurements of `load1` on a node exceed the number of cores the node has, then the value is `True`.
`False` otherwise.
This diff is collapsed.
import matplotlib.pyplot as plt
import numpy as np
# Draws the illustration for the `mem_usage_total` attribute: memory usage
# (U, %) against requested memory (R), with the U=30% and R=32 boundaries
# and the area below U=30% shaded for R >= 32.

# Axis limits and the requested-memory boundary.
XLIM = 512
YLIM = 100
x_const = 32

# Sample points along the x axis and the constant U=30% line over them.
requested = np.arange(1.0, XLIM)
usage_bound = np.full_like(requested, 30)

# Axes set-up.
plt.axis([1, XLIM, 0, YLIM])
plt.xlabel('Requested memory (R)GiB')
plt.ylabel('Memory usage (U, %)')
plt.grid(True, which="both", linestyle='--')

# Boundary lines: horizontal U=30%, then vertical R=32.
plt.plot(requested, usage_bound, label="U=30%")
plt.plot([x_const, x_const], [0, YLIM], label="R=32")

# Shade the area under the U=30% line to the right of the R boundary.
plt.fill_between(
    requested,
    usage_bound,
    where=requested >= x_const,
    color='red',
    alpha=0.3,
)

plt.legend(loc='lower right')
plt.savefig('../../img/mem_usage_total.svg')
plt.show()
......@@ -2,7 +2,7 @@ import sys
import math
from db.aggrstruct import *
LONG_DUR_SEC = 60
LONG_DUR_SEC = 300
def accepts(*types, **kw):
'''Function decorator. Checks decorated function's arguments are
......@@ -51,7 +51,7 @@ def node_has_high_load_interval(node):
conseq_points = 0
max_points = ceil(SeqVals.delta / LONG_DUR_SEC)
max_load = node.sockets * node.cores_per_socket
max_load = node.sockets * node.cores_per_socket * (node.phys_thr_core + node.virt_thr_core)
seq = node.seq_load_max
for p in seq.seq:
......
RULES = [
{
"attrs": {
"cpu_usage_total": "HIGH"},
"cpu_usage_total_max": "HIGH"},
"msg": "request more compute units"},
{
"attrs": {
"cpu_usage_total": "LOW",
"cpu_usage_total_max": "LOW",
"mem_usage_total": "LOW"},
"msg": "request less compute units"},
{
......@@ -16,10 +16,6 @@ RULES = [
"attrs": {
"min_proc_ratio": "HIGH"},
"msg": "All nodes have more processes than requested"},
{
"attrs": {
"min_proc_ratio": "ZERO"},
"msg": "Some nodes were not used at all"},
{
"attrs": {
"overloaded_node_exists": True},
......@@ -33,19 +29,36 @@ RULES = [
"job_nodes_amount": "MULT",
"req_runtime": "NORM",
"cpu_usage_total_max": "LOW",
"node_cpu_usage_max" : "LOW"},
"node_cpu_usage_min": "LOW",
"node_cpu_usage_max": "NORM"},
"msg": "The CPU usage on some nodes is low, please request less cores or increase amount of processes"},
{
"attrs": {
"job_nodes_amount": "MULT",
"req_runtime": "NORM",
"cpu_usage_total_max": "LOW",
"node_cpu_usage_max" : "NORM"},
"node_cpu_usage_min": "ZERO",
"node_cpu_usage_max": "NORM"},
"msg": "Some nodes were not used during the runtime"},
{
"attrs": {
"job_nodes_amount": "MULT",
"req_runtime": "NORM",
"cpu_usage_total_max": "NORM",
"node_cpu_usage_min": "LOW",
"node_cpu_usage_max": "NORM"},
"msg": "The CPU usage is not distributed equally among the nodes. Try to use the nodes evenly"},
{
    # Every node is under-used: both the least and the most loaded node
    # rate LOW. The two keys must differ -- a repeated key in a dict
    # literal silently keeps only the last value.
    "attrs": {
        "job_nodes_amount": "MULT",
        "req_runtime": "NORM",
        "cpu_usage_total_max": "LOW",
        "node_cpu_usage_min": "LOW",
        "node_cpu_usage_max": "LOW"},
    "msg": "The CPU usage of the job is low on all nodes, please request appropriate amount of resources"},
{
"attrs": {
"job_nodes_amount": "ONE",
"req_runtime": "NORM",
"node_cpu_usage_max" : "LOW"},
"node_cpu_usage_max": "LOW"},
"msg": "The CPU usage of the node is low. It might indicate that the job is not running in full power"},
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment