Commit 0277095e authored by akhuziy

ignoring short jobs + changes in config files

parent 7af1fb94
@@ -20,6 +20,7 @@ __pycache__
out
/conf/user.py
/conf/influxdb.py
/conf/config.py
htmlcov
.coverage
sonar-project.properties
......
@@ -19,7 +19,7 @@ python3 -m pip install -r requirements.txt
## Configuration
-Samples for configurations can be stored in the repository, please rename corresponding templates with an extension `.sample` to `.py` and change placeholder values accordingly.
+Samples for configurations can be stored in the repository; please rename the corresponding templates with the extension `.py.sample` to `.py` and change the placeholder values accordingly.
Real configs should be ignored via the `.gitignore` file.
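For example, a minimal sketch of preparing the real configs from the templates (it assumes a `.py.sample` template exists for each of the ignored files listed in `.gitignore` above):

```python
# Sketch: create the ignored .py configs from their tracked .py.sample
# templates; the three names come from the .gitignore entries above.
import shutil

for name in ("user", "influxdb", "config"):
    shutil.copy("conf/{:s}.py.sample".format(name), "conf/{:s}.py".format(name))
```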
......
#! /usr/bin/env python3

# DEBUGGING
#############################
DEBUG = False

# INFLUXDB
#############################
# interval of measurements in seconds
METRIC_INTERVAL = 10

# measurement names
measurements = {
    "proc": 'pfit-uprocstat',
    "jobs": 'pfit-jobinfo',
    "node": 'pfit-nodeinfo',
    "sys": 'system',
    "cpu": 'cpu',
    "infiniband": 'infiniband',
    "beegfs": 'beegfs_clients',
    "gpu_node": 'nvidia_gpu',
    "gpu_proc": 'nvidia_proc',
}

# Security
#############################
SLURM_BIN = ""  # FULL path to the Slurm binaries
LSF_BIN = ""    # FULL path to the LSF binaries
SECUSER = True  # use the setuid binary and check the user before fetching data

# commands used to look up the UID of a job's owner
job_uid_comm = {
    "lsf": "bjobs -noheader {jobid:s} -o \"user\" | id",
    "slurm": SLURM_BIN + "sacct -j {jobid:s} -X -P -n -o \"uid\"",
}

# DB
#############################
job_info = {
    "fetch_job_info_lsf": "bjobs -o \"{:s}\" {:s}",
    "fetch_job_info_slurm": "sacct --format=\"{:s}\" -X -P -j {:s}",
    "measurement_name": "pfit-jobinfo",
}

# Batch system
BATCH_SYSTEM = "SLURM"  # LSF or SLURM

MIN_DUR = 50  # minimum duration, in minutes, a job must run for a report to be generated

# Network
INFINIBAND = True  # aggregate InfiniBand data

# Filesystems
BEEGFS = True  # aggregate BeeGFS data

# GPU
GPU = True  # aggregate GPU data

# PDF report
SEQ_MAX_POINTS = 500
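As a quick illustration of the `{jobid:s}` placeholders in `job_uid_comm` above, a sketch with a hypothetical job ID (it assumes the sample has been renamed to `conf/config.py`):

```python
# Sketch: expand the Slurm UID-lookup template for a hypothetical job ID.
from conf.config import job_uid_comm

comm = job_uid_comm["slurm"].format(jobid="123456")
print(comm)  # with SLURM_BIN = "": sacct -j 123456 -X -P -n -o "uid"
```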
@@ -4,6 +4,9 @@ import json
import argparse
import db.aggregator as aggregator
import conf.config as conf
import subprocess
import re
import os
from format import format
from rcm import rcm
@@ -21,6 +24,38 @@ def merge_and_out(job_id, aggr, rcm, type, out_dir=None):
    print("{:s} data was exported in {:s}".format(type, filename))

def check_user(jobid):
    euid = os.geteuid()
    uid = os.getuid()

    # root is allowed to fetch data for any job
    if euid == 0 or uid == 0:
        return

    # build the UID-lookup command for the configured batch system
    if conf.BATCH_SYSTEM == "SLURM":
        job_uid_comm = conf.job_uid_comm["slurm"].format(jobid=jobid)
    elif conf.BATCH_SYSTEM == "LSF":
        job_uid_comm = conf.job_uid_comm["lsf"].format(jobid=jobid)
    else:
        print("Cannot check the user ID, no batch system specified")
        exit(1)

    result = subprocess.run(job_uid_comm, stdout=subprocess.PIPE, shell=True,
                            executable='/bin/bash')
    out = result.stdout.decode("utf-8")

    # the first number in the output is the UID of the job's owner
    m = re.search("[0-9]+", out)
    if m is None:
        print("Cannot parse UID: {:s}".format(out))
        exit(1)

    job_uid = m.group(0)

    if int(job_uid) != int(uid):
        print("Access denied. UIDs of the user and the job do not match.")
        exit(1)
def main():
    parser = argparse.ArgumentParser(description="""
    Gets the job information required for generating text or PDF reports
@@ -37,13 +37,18 @@ def main():
                        help="job ID used in the batch system")

    args = parser.parse_args()
    job_id = args.JOBID

    if conf.SECUSER:
        check_user(job_id)

    # Errors in arguments
    if args.output_dir is None:
        if args.type == "all":
            print("Cannot print both types of data to STDOUT")
            exit()
    # End of errors in arguments

    aggr = aggregator.get_aggregator(job_id, "pdf")
......
# db/common.py common functions
import requests
import json
import sys
from requests.auth import HTTPBasicAuth
from conf import influxdb as confdb
from . import metrics
def fetch_data(query):
    payload = {'db': confdb.IDB["database"], 'q': query}
    query_url = "{:s}/query".format(confdb.IDB["api_url"])
@@ -23,7 +23,6 @@ def fetch_data(query):
    return result

# format a metric according to its type
def format_metric(metrics_dict, metric, value):
    type = metrics_dict[metric]["type"]
@@ -42,3 +41,7 @@ def format_value(fmt, value):
        return float(value)
    return None

# print to stderr and terminate with the given exit code
def eprint(*args, err=1, **kwargs):
    print(*args, file=sys.stderr, **kwargs)
    exit(err)
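A quick usage sketch (the message here is hypothetical): because `eprint` exits on its own, callers do not need a separate `exit()`:

```python
# Hypothetical call: write the message to stderr, then exit with status 2.
eprint("query returned no rows", err=2)
```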
@@ -5,6 +5,7 @@ from .influxdb_fetchnode import get_node_data
from .influxdb_fetchseq import get_seq_data
from .influxdb_fetchgpu import get_gpu_data
from ..aggrstruct import *
from . import common

def fetch_all(job_id, type):
    data = {}
@@ -12,6 +13,12 @@ def fetch_all(job_id, type):
    t_start = data["job"]["start_time"]
    t_end = t_start + data["job"]["run_time"]

    # Ignore short jobs: eprint() writes to stderr and exits, so no report is generated.
    if data["job"]["run_time"] < conf.MIN_DUR * 60:
        common.eprint("The runtime of the job ({dur:d} seconds) is too short. Minimum is {min:d} seconds".format(
            dur=int(data["job"]["run_time"]),
            min=int(conf.MIN_DUR * 60),
        ))

    data["proc"] = get_proc_data(job_id)
    node_ids = list(data["proc"].keys())
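A worked example of the check above, assuming the sample config's `MIN_DUR = 50`: any job shorter than 50 * 60 = 3000 seconds is rejected before any metric data is fetched.

```python
# Hypothetical 30-minute job under the sample config (MIN_DUR = 50 minutes):
run_time = 1800                # seconds
print(run_time < 50 * 60)      # True: 1800 < 3000, so the job is ignored
```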