xls.py

#!/usr/local/bin/python3
""" a simple script that converts ilias exam output to readable json

usage: convert.py [-h] [-u USERNAMES] [-n NUMBER_OF_TASKS] INFILE OUTFILE

positional arguments:
  INFILE                Ilias exam data
  OUTFILE               Where to write the final file

optional arguments:
  -h, --help            show this help message and exit
  -u USERNAMES, --usernames USERNAMES
                        a json dict matno -> email
  -n NUMBER_OF_TASKS, --NUMBER_OF_TASKS NUMBER_OF_TASKS
                        Where to write the final file


Author: Jan Maximilian Michal
Date: 30 March 2017
"""

import json
import os
import re
import urllib.parse
from collections import defaultdict, namedtuple

from xlrd import open_workbook

import lib.generic


class XLSConverter(lib.generic.Converter):
    """docstring for XLSConverter"""

    accepted_files = ('.xls',)

    def convert(self, filepath):
        return converter(filepath)


# one user has one submission (code) per task
# yes, I know it is possible to name match groups via (?P<name>) but
# I like this solution better since it gets the job done nicely
user_t = namedtuple('user_head', 'name matrikel_no')

# one task has a title and id and hpfly code
task_head_re = re.compile(r'^Quellcode Frage (?P<title>.*?) ?(\d{8})?$')

# nor parsing the weird mat no
matno_re = re.compile(r'^(?P<matrikel_no>\d+)-(\d+)-(\d+)$')

TABWIDTH = 4


def converter(infile, usernames=None, number_of_tasks=0):

    # Modify these iterators in order to change extraction behaviour

    def sheet_iter_meta(sheet, silent=True):
        """ yield first and second col entry as tuple of (name, matnr) """
        for row in (sheet.row(i) for i in range(1, sheet.nrows)):
            match = re.search(matno_re, row[1].value)
            if match:
                if not silent and len(match.group('matrikel_no')) != 8:
                    print('[WARN] %s has odd matrikelno %s' %
                          (row[0].value, match.group('matrikel_no')))
                yield row[0].value, match.group('matrikel_no')
            else:
                if not silent:
                    print('[WARN] could not parse row %s' % row[0])
                yield row[0].value, row[1].value

    def sheet_iter_data(sheet):
        """ yields all source code title and code tuples """
        def row(i):
            return sheet.row(i)
        for top, low in ((row(i), row(i + 1)) for i in range(sheet.nrows - 1)):
            if any(map(lambda c: c.ctype, top)) and 'Quell' in top[0].value:
                yield (' '.join(c.value for c in top),
                       ' '.join(c.value for c in low))

    # meta sheet contains ilias names usernames etc - data contains code
    meta, *data = open_workbook(infile, open(os.devnull, 'w')).sheets()

    # nice!
    name2mat = dict(sheet_iter_meta(meta, silent=False))
    assert len(name2mat) == len(data), '{} names != {} sheets'.format(len(name2mat), len(data))  # noqa

    # from xls to lists and namedtuples
    # [ [user0, task0_h, code0, ..., taskn, coden ], ..., [...] ]
    root = []
    tasks = {}
    for user, sheet in zip(sheet_iter_meta(meta), data):
        root.append([user_t(*user)])
        for task, code in sheet_iter_data(sheet):
            task = re.search(task_head_re, task)
            task_title = task.group('title')
            tasks[task_title] = {
                'title': task_title,
                'type': 'SourceCode'
            }
            root[-1].append(task.group('title'))
            root[-1].append(urllib.parse
                            .unquote(code)
                            .replace('\t', ' ' * TABWIDTH))

    if number_of_tasks:
        for (user, *task_list) in sorted(root, key=lambda u: u[0].name):
            assert len(task_list) == number_of_tasks * 2

    mat_to_email = defaultdict(str)
    if usernames:
        with open(usernames) as data:
            mat_to_email.update(json.JSONDecoder().decode(data.read()))

    def get_username(user):
        if name2mat[user.name] in mat_to_email:
            return mat_to_email[name2mat[user.name]].split('@')[0]
        return ''.join(filter(str.isupper, user.name)) + name2mat[user.name]

    usernames = {user.name: get_username(user) for (user, *_) in root}

    return {
        'students': [{
            'fullname': user.name,
            'username': usernames[user.name],
            'email': mat_to_email[name2mat[user.name]],
            'identifier': name2mat[user.name],
            'submissions': [{
                "type": task,
                "code": code,
                "tests": {},
            } for task, code in zip(task_list[::2], task_list[1::2])]
        } for (user, *task_list) in sorted(root, key=lambda u: u[0].name)],
        'tasks': list(tasks.values())
    }


def write_to_file(json_dict, outfile):
    # just encode python style
    with open(outfile, "w") as out:
        json.dump(json_dict, out, indent=2)

    print("Wrote data to %s. Done." % outfile)