Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • robinwilliam.hundt/hektor
  • j.michal/hektor
2 results
Show changes
Commits on Source (13)
......@@ -10,3 +10,4 @@ deploy.sh
.DS_Store
*.xls
.venv/
testall.sh
image: python:3.4
image: python:3.5
before_script:
- python -V
- pip install -e .
variables:
PIP_CACHE_DIR: "$CI_PROJECT_DIR/pip-cache"
cache:
paths:
- ~/.cache/pip/
- "$CI_PROJECT_DIR/pip-cache"
test:
flake8:
script:
- pip install -e .
- pip install flake8
- flake8 hektor.py bin lib
test:
script:
- hektor -h
.PHONY: dist clean upload tag help
help:
@echo "See Makefile itself for help"
clean:
rm -r hektor.egg-info dist build __pycache__
dist:
pip install -U setuptools pip wheel && \
python setup.py bdist_wheel --universal
upload: dist
twine upload dist/*
tag:
git tag `python setup.py --version`
#!/usr/bin/env python3
""" Entry point script: check the interpreter version, then run hektor. """

import sys

import hektor

if __name__ == '__main__':
    # hektor uses syntax/stdlib features that require at least Python 3.5.
    # Fix: the message previously said 3.4 while the check was against 3.5.
    if sys.version_info < (3, 5):
        sys.exit("At least Python 3.5 is required.")
    hektor.main()
import argparse
import base64
import functools
import getpass
import json
import logging
import os
from typing import Any, Callable, Dict, Sequence
from typing import Any, Callable, Dict, List, Sequence, Union
from lib import Converter
from xkcdpass import xkcd_password as xp
from lib import Converter
try:
from cryptography.fernet import Fernet
except ImportError:
Fernet = None
# ============================== =- Logging -= ============================== #
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
def setup_logging():
''' Make the logger globally available by hide intermediate handler,
filters and formatter variables '''
global log
# create console handler and formatter
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
formatter = logging.Formatter('[%(levelname)s] %(message)s')
level = logging.DEBUG if args.verbose else logging.INFO
# add formatter to console handler
console.setFormatter(formatter)
log.addHandler(console)
log = logging.getLogger(__name__)
log.setLevel(level)
# create console handler and formatter
console = logging.StreamHandler()
console.setLevel(level)
formatter = logging.Formatter('[%(levelname)s] %(message)s')
# add formatter to console handler
console.setFormatter(formatter)
log.addHandler(console)
# ============================= =- argparse -= ============================== #
def parseme():
def setup_argparse():
global args
def file_exists(parser, filepath: str) -> str:
if not os.path.isfile(filepath):
parser.error('Not a file %s' % filepath)
return filepath
parser = argparse.ArgumentParser()
# General purpose arguments
parser.add_argument(
'-v', '--verbose',
action='store_true',
help='enable verbose logging (Level: DEBUG)')
# Input output files
parser.add_argument(
'input',
metavar='DATA',
......@@ -42,7 +64,21 @@ def parseme():
'output',
metavar='OUTFILE',
help='destination of converter output (JSON)')
parser.add_argument(
# Post-processor flags
remove_personal = parser.add_mutually_exclusive_group()
remove_personal.add_argument(
'-e', '--encrypt',
action='store_true',
help='''strip all personal information and provide decryption key
(AES 128-bit, CBC mode, PKCS7 for padding, HMAC with SHA-256
for integrity)'''
)
remove_personal.add_argument(
'-d', '--decrypt',
action='store_true',
help='Reverse previous AES encryption.')
remove_personal.add_argument(
'-a', '--anonymous',
action='store_true',
help='replace personal information and create a reversing table')
......@@ -51,71 +87,76 @@ def parseme():
help='where to store personal information (CSV)',
)
parser.add_argument(
'-m', '--meta',
'-m', '--add-meta',
action='store_true',
help='add meta information (lecturer, course title)'
)
parser.add_argument(
'--verify',
action='store_true',
default=True,
help='asserts that output data will be in a certain format'
)
parser.add_argument(
'-r', '--readable-code',
action='store_true',
help='make student code readable by inserting artificial line breaks')
args = parser.parse_args()
if args.anonymous != (args.personal_secret_table is not None):
parser.error('Need an output for anonymous mode')
if (args.decrypt or args.encrypt) and Fernet is None:
parser.error('To use AES encryption, install cryptography via pip')
return args
if args.anonymous != (args.personal_secret_table is not None):
parser.error('Please specify where to write the mapping (see -t)')
# ========================== =- General Purpose -= ========================== #
def compose(*functions: Sequence[Callable]) -> Callable:
    """ Standard function composition.

    Given functions [f, g, h, ...] return the composite i(x) = f(g(h(x))).
    No checks are performed that the domain and image of neighbouring
    functions are compatible; with no arguments the identity is returned.
    """
    def composite(value):
        # Apply right-to-left so the first argument is the outermost call.
        for fn in reversed(functions):
            value = fn(value)
        return value
    return composite
# ========================== =- Post processors -= ========================== #
def anonymise(structured_data: Dict[str, Any]) -> Dict[str, Any]:
DELIMITER = '-'
wordfile = xp.locate_wordfile()
words = xp.generate_wordlist(wordfile=wordfile,
min_length=7,
max_length=7)
def abort(message: str = 'Bye.'):
    ''' In case anything goes wrong. Basically a dumb wrapper around exit:
    logs the message at INFO level and terminates with status code 1. '''
    log.info(message)
    exit(1)
def get_identifier():
return xp.generate_xkcdpassword(words, numwords=2, delimiter=DELIMITER)
students = structured_data.pop('students')
reverser = {get_identifier(): s for s in students.values()}
students_anon = {r: {
'fullname': ' '.join(w[0].capitalize() + w[1:]
for w in r.split(DELIMITER)),
'identifier': r,
'submissions': student['submissions']
} for r, student in zip(reverser, students.values())}
# ========================== =- Post processors -= ========================== #
def student_replacer(processor):
''' A simple decorator that is used to remove students and put them back in
when the preprocessor is dome with them'''
with open(args.personal_secret_table, 'w') as out:
print('key, previous identifier, fullname', file=out)
print('\n'.join(anon + '\t' + '\t'.join(v
for v in data.values()
if type(v) is str)
for anon, data in reverser.items()), file=out)
@functools.wraps(processor)
def processor_closure(structured_data: Dict[str, Any]) -> Dict[str, Any]:
students = structured_data.pop('students')
students_replacement = processor(students)
structured_data['students'] = students_replacement
return structured_data
structured_data.update({'students': students_anon})
return structured_data
return processor_closure
def add_meta_information(structured_data: Dict[str, Any]) -> Dict[str, Any]:
if args.meta:
structured_data['author'] = input('[Q] author: ')
structured_data['exam'] = input('[Q] course title: ')
def do_add_meta(structured_data: Dict[str, Any]) -> Dict[str, Any]:
    ''' Asks the user for metadata about the exam (author, course title)
    and stores the answers in the top level of the data dict. '''
    prompts = (('author', '[Q] author: '),
               ('exam', '[Q] course title: '))
    for field, question in prompts:
        structured_data[field] = input(question)
    return structured_data
def assert_correct_format(structured_data: Dict[str, Any]) -> Dict[str, Any]:
def do_verify(structured_data: Dict[str, Any]) -> Dict[str, Any]:
''' The is the testable specification of the format that is output by
hector. Since multiple formats are compiled into this one verification is
on by default. The impact on performance is neglectable. '''
def assert_submission(submission):
assert 'code' in submission, 'A submission needs code'
assert type(submission['code']) in [str, list], 'Code is readable'
assert 'type' in submission, 'A submission has to be of some type'
assert 'tests' in submission, 'A tests dict has to be present.'
......@@ -124,21 +165,34 @@ def assert_correct_format(structured_data: Dict[str, Any]) -> Dict[str, Any]:
len(student['submissions'])))
assert 'fullname' in student, 'Student needs a name %s' % student
assert 'identifier' in student, 'Student needs a unique identifier'
assert 'username' in student, 'Student needs a unique username'
def base_assert():
assert 'students' in structured_data, 'No students found'
assert 'tasks' in structured_data, 'No tasks found'
def assert_task(task):
assert 'type' in task, 'Task has no type'
assert 'title' in task, 'Task must have a title'
try:
base_assert()
students = structured_data['students'].values()
number_of_submissions = len(structured_data['tasks'])
students = structured_data['students']
tasks = structured_data['tasks']
number_of_submissions = len(tasks)
for task in tasks:
try:
assert_task(task)
except AssertionError as err:
raise err
for student in students:
try:
assert_student(student)
assert number_of_submissions == len(student['submissions']), \
'%s does not have enough submissoins' % student['fullname']
'%s does not have enough submissions' % student['fullname']
for submission in student['submissions']:
try:
......@@ -155,11 +209,131 @@ def assert_correct_format(structured_data: Dict[str, Any]) -> Dict[str, Any]:
return structured_data
post_processors = [
anonymise,
# add_meta_information,
# assert_correct_format
]
@student_replacer
def do_readable_code(students: Dict[str, Union[str, List]]):
    ''' Make student code readable: split every submission's code string
    into a list of lines so the JSON output is diffable/human-readable. '''
    for student in students:
        submissions = student['submissions']
        for entry in submissions:
            entry['code'] = entry['code'].split('\n')
    return students
@student_replacer
def do_anonymous(students: Dict[str, Union[str, List]]):
    ''' Recreates most of the data and includes fields over a whitelist
    therefore ensuring that no personal information remains in the data '''
    DELIMITER = '-'
    # Restrict the xkcdpass wordlist to 7-letter words so the generated
    # pseudonyms look uniform.
    wordfile = xp.locate_wordfile()
    words = xp.generate_wordlist(wordfile=wordfile,
                                 min_length=7,
                                 max_length=7)

    def get_random_xkcd_identifier():
        # e.g. "example-another" -- two random words joined by DELIMITER
        return xp.generate_xkcdpassword(words, numwords=2, delimiter=DELIMITER)

    # Map each random pseudonym to the original student record.
    # NOTE(review): pseudonyms are random dict keys; a collision would
    # silently drop a student here -- confirm the word space is large enough.
    reverser = {get_random_xkcd_identifier(): s for s in students}

    # Rebuild the student list over a whitelist of fields only; the
    # pseudonym doubles as fullname (capitalised words), identifier and
    # username. Submissions are passed through untouched.
    students_anonymous = [{
        'fullname': ' '.join(w[0].capitalize() + w[1:]
                             for w in pseudo_identifier.split(DELIMITER)),
        'identifier': pseudo_identifier,
        'username': pseudo_identifier,
        'submissions': student['submissions']
    } for pseudo_identifier, student in reverser.items()]

    # Persist the semicolon-separated reversing table so the anonymisation
    # can be undone later (path comes from the -t/--personal-secret-table
    # command line argument).
    with open(args.personal_secret_table, 'w') as out:
        print('key;previous identifier;fullname', file=out)
        print('\n'.join('%s;%s;%s' % (anonymous_key,
                                      data['identifier'],
                                      data['fullname'])
                        for anonymous_key, data in reverser.items()), file=out)

    return students_anonymous
@student_replacer
def do_encrypt(students):
    ''' Encrypt every personal field of every student with a freshly
    generated Fernet key and hand that key over to the user. '''
    # Init Crypto. See the module documentation on what actually happens here,
    # then read all about those methods and then go study number theory. Never
    # roll your own custom crypto ;-)
    key = Fernet.generate_key()
    aes = Fernet(key)

    def encrypt(clear: str) -> str:
        # Fernet output is wrapped in another base64 layer so the stored
        # value is guaranteed plain ASCII text.
        return base64.b64encode(aes.encrypt(clear.encode())).decode('utf-8')

    output_the_key_to_the_user(key)
    return transform(students, encrypt)
@student_replacer
def do_decrypt(students):
    ''' Reverse a previous --encrypt run. Prompts the user for the key. '''
    def decrypt(cipher: str) -> str:
        # Undo the outer base64 layer first, then the Fernet encryption.
        return aes.decrypt(base64.b64decode(cipher.encode())).decode('utf-8')

    try:
        key = getpass.getpass('[Q] Give me the decryption key: ')
        aes = Fernet(key)
        return transform(students, decrypt)
    except Exception as err:
        # Deliberately broad: any failure (malformed key, failed MAC,
        # bad base64) means the data cannot be decrypted -- abort.
        abort('Your key is bad (%s).' % err)
# ======================= =- Post processor helper -= ======================= #
def transform(students, function):
    ''' Apply *function* to every personal field (fullname, identifier,
    username) of each student; submissions are passed through unchanged. '''
    personal_fields = ('fullname', 'identifier', 'username')
    result = []
    for student in students:
        record = {field: function(student[field]) for field in personal_fields}
        record['submissions'] = student['submissions']
        result.append(record)
    return result
def output_the_key_to_the_user(key: bytes):
    ''' Ask the user where to store a freshly generated encryption key and
    write it there: stdout by default, otherwise a file. Refuses to
    overwrite an existing file without confirmation and aborts rather
    than write to an unusable destination. '''
    def to_file(filepath: str):
        # 'wb' because the key is bytes, not text.
        with open(filepath, 'wb') as file:
            file.write(key)
        log.info('Key written to %s. Keep it safe.', filepath)

    def to_stdout():
        # Fix: message previously read "Key this key safe".
        print('Encrypted and signed. Keep this key safe or bad things happen')
        print(' --------->> %s <<--------- ' % key.decode('latin-1'))

    output = input('[Q] The data has been encrypted. ' +
                   'Where should I put the key? (stdout) ') or 'stdout'

    if output == 'stdout':
        to_stdout()
    elif not os.path.exists(output):
        to_file(output)
    elif os.path.isfile(output):
        confirm = input('[Q] File exists. Want to override? (Y/n)') or 'y'
        if confirm.lower().startswith('y'):
            to_file(output)
        else:
            abort('No data was written. Bye.')
    else:
        # Destination exists but is not a regular file (e.g. a directory).
        log.error('I cannot write to %s.', output)
        abort()
def get_active_postprocessors():
    ''' Yield the post-processors that were enabled on the command line,
    in the fixed order they must run. Each processor do_<name> is active
    when the parsed argument <name> is truthy. '''
    ordered_processors = (
        do_add_meta,
        do_verify,
        do_readable_code,
        do_anonymous,
        do_encrypt,
        do_decrypt
    )
    for processor in ordered_processors:
        flag = processor.__name__[len('do_'):]
        if getattr(args, flag):
            yield processor
# ============================== =- Hektor -= =============================== #
......@@ -168,6 +342,8 @@ def _preprocessing(filepath: str) -> str:
def _processing(filepath: str) -> Dict[str, Any]:
''' Find the first apropriate converter and run pass it the path to the
datafile. '''
try:
return next(converter().convert(filepath)
for converter in Converter.implementations()
......@@ -177,21 +353,26 @@ def _processing(filepath: str) -> Dict[str, Any]:
', '.join(f
for c in Converter.implementations()
for f in c.accepted_files))
abort('Program stopped prematurely. No data was written. Bye.')
def _postprocessing(structured_data: Dict[str, Any]) -> Dict[str, Any]:
return compose(*post_processors)(structured_data)
return compose(*get_active_postprocessors())(structured_data)
def main():
global args
args = parseme()
setup_argparse()
setup_logging()
log.debug('Active post processors %s', list(get_active_postprocessors()))
processing = compose(_postprocessing, _processing, _preprocessing)
data = processing(args.input)
destination = args.output.split('.json')[0] + '.json'
with open(destination, 'w') as output:
json.dump(data, output, indent=2, sort_keys=True)
log.info('Wrote exam data to %s', destination)
......
......@@ -3,3 +3,4 @@
from lib.generic import Converter # noqa
from lib.qti import QTIConverter # noqa
from lib.xls import XLSConverter # noqa
from lib.identity import JSONIdentityConverter # noqa
......@@ -7,7 +7,8 @@ def all_subclasses(cls):
class Converter(metaclass=abc.ABCMeta):
""" A base class if we incorporate more converters in the future """
""" A base class if we incorporate more converters in the future. New
implementations need to be registered in this modules __init__.py """
@abc.abstractmethod
def convert(self):
......
import json
import lib.generic
class JSONIdentityConverter(lib.generic.Converter):
    """ Identity converter: accepts a JSON file that hektor itself produced
    earlier, so that a post-processor can be run on it again. """

    accepted_files = ('.json',)

    def convert(self, filepath):
        # The file already has the target structure -- just parse it.
        with open(filepath) as source:
            return json.load(source)
......@@ -2,13 +2,12 @@ import base64
import re
import zipfile
from lxml import etree
import lib.generic
from lxml import etree
class QTIConverter(lib.generic.Converter):
"""docstring for XLSConverter"""
""" XLSConverter class (Currently raw xml input is not supported) """
accepted_files = ('.zip', '.xml')
......@@ -51,12 +50,11 @@ def process_qti(tree, only_of_type=('assSourceCode',), **kwargs):
def process_users(results_tree):
return {row.attrib['active_id']: dict(row.attrib)
for row in results_tree.xpath(users)}
return [dict(row.attrib) for row in results_tree.xpath(users)]
def convert_code(text):
    ''' Decode a base64 encoded submission and split it into lines.

    Fix: removed a second, unreachable return statement that followed this
    one; the reachable behaviour (decode, then split on newlines) is kept.
    '''
    return base64.b64decode(text).decode('utf-8').split('\n')
def process_solutions(results_tree, task_id):
......@@ -67,14 +65,15 @@ def process_solutions(results_tree, task_id):
def process_results(tree, qti=(), **kwargs):
questions = qti
users = process_users(tree)
for user in users.values():
id2user = {user['active_id']: user for user in users}
for user in users:
user['submissions'] = []
for question in questions:
solutions = process_solutions(tree, question)
for question_key, question in questions.items():
solutions = process_solutions(tree, question_key)
for user_id, solution in solutions.items():
users[user_id]['submissions'].append({'type': question,
'code': solution,
'tests': {}})
id2user[user_id]['submissions'].append({'type': question['title'],
'code': solution,
'tests': {}})
return users
......@@ -128,8 +127,9 @@ ignore_user_fields = ("user_fi",
def add_users(base, data):
for userdata in data['results'].values():
for userdata in data['results']:
userdata['identifier'] = userdata['user_fi']
userdata['username'] = userdata['user_fi']
for field in ignore_user_fields:
userdata.pop(field)
base['students'] = data['results']
......
#!/usr/local/bin/python3
""" a simple script that converts ilias exam output to readable json
The json output will look like this:
{
"max.mustermann": { <<--- OR all uppercase letter of the name + username/matrikel_no # noqa: E501
"matrikel_no": "12345678",
"name": "Mustermann, Max",
"task_list": {
"[task_id_1]": "print Hello World!",
....,
"[task_id_n]": "#include <stdio.h> etc."
}
},
... and so on
}
usage: convert.py [-h] [-u USERNAMES] [-n NUMBER_OF_TASKS] INFILE OUTFILE
positional arguments:
......@@ -39,9 +25,8 @@ import re
import urllib.parse
from collections import defaultdict, namedtuple
from xlrd import open_workbook
import lib.generic
from xlrd import open_workbook
class XLSConverter(lib.generic.Converter):
......@@ -62,24 +47,31 @@ user_t = namedtuple('user_head', 'name matrikel_no')
task_head_re = re.compile(r'^Quellcode Frage (?P<title>.*?) ?(\d{8})?$')
# for parsing the weird mat no
matno_re = re.compile(r'^(?P<matrikel_no>\d{8})-(\d+)-(\d+)$')
matno_re = re.compile(r'^(?P<matrikel_no>\d+)-(\d+)-(\d+)$')
COLUMNS_BEFORE_TASKS = 19
TABWIDTH = 4
def converter(infile, usernames=None, number_of_tasks=0,):
def converter(infile, usernames=None, number_of_tasks=0):
# Modify these iterators in order to change extraction behaviour
def sheet_iter_meta(sheet):
def sheet_iter_meta(sheet, silent=True):
""" yield first and second col entry as tuple of (name, matnr) """
for row in (sheet.row(i) for i in range(1, sheet.nrows)):
match = re.search(matno_re, row[1].value)
if match:
if not silent and len(match.group('matrikel_no')) != 8:
print('[WARN] %s has odd matrikelno %s' %
(row[0].value, match.group('matrikel_no')))
yield row[0].value, match.group('matrikel_no')
else:
if not silent:
print('[WARN] could not parse row %s' % row[0])
yield row[0].value, row[1].value
def sheet_iter_data(sheet):
""" yields all source code titel and code tuples """
""" yields all source code title and code tuples """
def row(i):
return sheet.row(i)
for top, low in ((row(i), row(i + 1)) for i in range(sheet.nrows - 1)):
......@@ -91,8 +83,8 @@ def converter(infile, usernames=None, number_of_tasks=0,):
meta, *data = open_workbook(infile, open(os.devnull, 'w')).sheets()
# nice!
name2mat = dict(sheet_iter_meta(meta))
assert len(name2mat) == len(data), f'{len(name2mat)} names != {len(data)} sheets' # noqa
name2mat = dict(sheet_iter_meta(meta, silent=False))
assert len(name2mat) == len(data), '{} names != {} sheets'.format(len(name2mat), len(data)) # noqa
# from xls to lists and namedtuples
# [ [user0, task0_h, code0, ..., taskn, coden ], ..., [...] ]
......@@ -108,7 +100,9 @@ def converter(infile, usernames=None, number_of_tasks=0,):
'type': 'SourceCode'
}
root[-1].append(task.group('title'))
root[-1].append(urllib.parse.unquote(code).strip())
root[-1].append(urllib.parse
.unquote(code)
.replace('\t', ' ' * TABWIDTH))
if number_of_tasks:
for (user, *task_list) in sorted(root, key=lambda u: u[0].name):
......@@ -127,20 +121,17 @@ def converter(infile, usernames=None, number_of_tasks=0,):
usernames = {user.name: get_username(user) for (user, *_) in root}
return {
'students': {
usernames[user.name]: {
'fullname': user.name,
'email': mat_to_email[name2mat[user.name]],
'identifier': name2mat[user.name],
'submissions': [
{
"type": task,
"code": code,
"tests": {},
} for task, code in zip(task_list[::2], task_list[1::2])
]
} for (user, *task_list) in sorted(root, key=lambda u: u[0].name)
},
'students': [{
'fullname': user.name,
'username': usernames[user.name],
'email': mat_to_email[name2mat[user.name]],
'identifier': name2mat[user.name],
'submissions': [{
"type": task,
"code": code,
"tests": {},
} for task, code in zip(task_list[::2], task_list[1::2])]
} for (user, *task_list) in sorted(root, key=lambda u: u[0].name)],
'tasks': list(tasks.values())
}
......@@ -150,4 +141,4 @@ def write_to_file(json_dict, outfile):
with open(outfile, "w") as out:
json.dump(json_dict, out, indent=2)
print(f"Wrote data to {outfile}. Done.")
print("Wrote data to %s. Done." % outfile)
......@@ -4,7 +4,7 @@ from setuptools import setup
setup(
name='hektor',
version='0.2',
version='0.3.5',
description='A QTI-XML/XLS to JSON converter for humans',
author='Jan Maximilian Michal',
author_email='mail@janmax.org',
......@@ -14,5 +14,6 @@ setup(
install_requires=["lxml~=4.1.1",
"xlrd~=1.1.0",
"xkcdpass~=1.16.0"],
py_modules=['hektor']
py_modules=['hektor'],
packages=['lib']
)