Fix training data generation. Refactoring

434d66d1 · cboulanger · e41b3f96 · 434d66d1
Commit 434d66d1 authored 1 year ago by cboulanger
--- a/mlx/lora/finetune-experiments.ipynb
+++ b/mlx/lora/finetune-experiments.ipynb
@@ -142,6 +142,75 @@
   },
   "id": "a9dff0d6c779882c"
  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Set paths for model"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "f9ba088a74f8c557"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "HF_MODEL_PATH=mistralai/Mistral-7B-Instruct-v0.2\n",
+      "LOCAL_MODEL_PATH=mlx_models/mistralai/Mistral-7B-Instruct-v0.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "HF_MODEL_PATH = 'mistralai/Mistral-7B-Instruct-v0.2'\n",
+    "LOCAL_MODEL_PATH = f'mlx_models/{HF_MODEL_PATH}'\n",
+    "os.environ['HF_MODEL_PATH'] = HF_MODEL_PATH\n",
+    "os.environ['LOCAL_MODEL_PATH'] = LOCAL_MODEL_PATH\n",
+    "print(f\"\"\"\n",
+    "HF_MODEL_PATH={HF_MODEL_PATH}\n",
+    "LOCAL_MODEL_PATH={LOCAL_MODEL_PATH}\n",
+    "\"\"\".strip())\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-03-01T08:55:33.087209Z",
+     "start_time": "2024-03-01T08:55:33.080961Z"
+    }
+   },
+   "id": "203bf0c10dd860a5"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Create a 4-Bit quantized model if necessary"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "a52bdff5b0eae3bd"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "outputs": [],
+   "source": [
+    "![ -d \"$LOCAL_MODEL_PATH\" ] || python convert.py --hf-path \"$HF_MODEL_PATH\" --mlx-path \"$LOCAL_MODEL_PATH\" -q"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-03-01T06:51:00.898588Z",
+     "start_time": "2024-03-01T06:51:00.764741Z"
+    }
+   },
+   "id": "fdb9ec6772be0c23"
+  },
  {
   "cell_type": "markdown",
   "source": [
@@ -208,7 +277,6 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-     "end_time": "2024-03-01T08:54:57.837466Z",
     "start_time": "2024-03-01T08:54:56.039305Z"
    }
   },
@@ -248,71 +316,11 @@
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-     "end_time": "2024-03-01T08:54:59.410535Z",
     "start_time": "2024-03-01T08:54:59.402009Z"
    }
   },
   "id": "6181ba9486346975"
  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "HF_MODEL_PATH=mistralai/Mistral-7B-Instruct-v0.2\n",
-      "LOCAL_MODEL_PATH=mlx_models/mistralai/Mistral-7B-Instruct-v0.2\n"
-     ]
-    }
-   ],
-   "source": [
-    "import os\n",
-    "HF_MODEL_PATH = 'mistralai/Mistral-7B-Instruct-v0.2'\n",
-    "LOCAL_MODEL_PATH = f'mlx_models/{HF_MODEL_PATH}'\n",
-    "os.environ['HF_MODEL_PATH'] = HF_MODEL_PATH\n",
-    "os.environ['LOCAL_MODEL_PATH'] = LOCAL_MODEL_PATH\n",
-    "print(f\"\"\"\n",
-    "HF_MODEL_PATH={HF_MODEL_PATH}\n",
-    "LOCAL_MODEL_PATH={LOCAL_MODEL_PATH}\n",
-    "\"\"\".strip())\n"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "ExecuteTime": {
-     "end_time": "2024-03-01T08:55:33.087209Z",
-     "start_time": "2024-03-01T08:55:33.080961Z"
-    }
-   },
-   "id": "203bf0c10dd860a5"
-  },
-  {
-   "cell_type": "markdown",
-   "source": [
-    "### Create a 4-Bit quantized model if necessary"
-   ],
-   "metadata": {
-    "collapsed": false
-   },
-   "id": "a52bdff5b0eae3bd"
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "outputs": [],
-   "source": [
-    "![ -d \"$LOCAL_MODEL_PATH\" ] || python convert.py --hf-path \"$HF_MODEL_PATH\" --mlx-path \"$LOCAL_MODEL_PATH\" -q"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "ExecuteTime": {
-     "end_time": "2024-03-01T06:51:00.898588Z",
-     "start_time": "2024-03-01T06:51:00.764741Z"
-    }
-   },
-   "id": "fdb9ec6772be0c23"
-  },
  {
   "cell_type": "markdown",
   "source": [

 %% Cell type:markdown id:d6264ff5d5024ba1 tags:

 # Finetuning experiment: Extract structured data for German law journal editors from website text

 based on https://github.com/ml-explore/mlx-examples/tree/main/lora

 Hardware: Mac mini 2023 (M2, 16 GB RAM)

 %% Cell type:markdown id:1135fbc8a6ced279 tags:

 ## Preparation

 ### Download website data

 This only downloads new content if the list of journals has changed or previously downloaded files have been deleted. To overwrite existing files, pass `overwrite=True`.

 %% Cell type:code id:9eb2effc7bfb22f tags:

 ``` python
 from lib.prepare_training_data import download_input_data
 # Use the same CSV and content directory that the `create_training_file`
 # cells read from (data/editors/...), so downloaded pages are actually
 # picked up when the training files are generated.
 download_input_data(input_file='data/editors/editors.csv',
                     output_dir='data/editors/website-data',
                     overwrite=False)
 ```

 %% Output


    Downloaded 0 web pages.

 %% Cell type:markdown id:434335a9891b27e7 tags:

 ### Prompt and test data for all experiments

 %% Cell type:code id:b4be7c0872d2fd34 tags:

 ``` python
 # System-level framing that is prepended to every prompt built in this notebook.
 system_message ="""
 You are a text processing agent. As instructed below, extract information from the provided content in a structured format without discussing reasoning or providing commentary. Only use source text given as input for data extraction unless specifically asked for inference.
 """

 # Task description: which people to extract and which fields/format to emit.
 instruction = """
 Analyze content from a German law journal's website. Your task is to identify members of the editorial board (terms to look for: 'Herausgeber', 'Redakteur', 'Schriftleitung') and the advisory board ('Beirat'). For each identified member, extract and organize their information into the following categories: lastname, firstname, title (including academic titles like 'Dr.' or 'Prof. Dr.' and suffixes such as 'LL.M.'), position (their job title, if provided), affiliation, and role. For 'role', infer the role within the journal from the context (options 'Herausgeber', 'Redaktion', 'Schriftleitung', 'Beirat', or an empty string if the role is unknown).

 - Format the output as a YAML list of dictionaries.
 - Exclude any dictionary entries for which information is not available or relevant fields are empty.
 - Ensure the YAML output is strictly valid. It must be a list of dictionaries.
 """

 # One-shot example used by the manual-test and zero-shot prompts (it is not
 # part of the fine-tuning instruction). Grammar fixed: "Vorsitzende Richterin".
 example = """
 Here is an example:

 ```yaml
 - lastname: Mustermann
  firstname: Martina
  title: Dr.
  position: Vorsitzende Richterin
  affiliation: Oberlandesgericht Buxtehude
  role: Herausgeber
 ```
 """

 # Closing sentence placed right before the content to analyse. The stray
 # trailing double quote that used to leak into every prompt has been removed;
 # previously generated training files must be regenerated to pick this up.
 epilog="""
 Adhere to these guidelines to efficiently and accurately process the following content:
 """

 # Sample page text used as input for the manual-test and zero-shot prompts.
 test_data = """
 Herausgeber:
 Prof. Dr. Stefan Knesebeck, Universität Wuppertal
 Prof. Dr. Dr. h.c. Fritz M. Müller LL.M.(Yale), Universität Wanne-Eickel
 RA Prof. Dr. Vera Valentin, Hochschule für Recht und Sport Edingen
 Prof. Dr. Dr. h.c. Rita Rosenbaum, Universität Tupfingen
 Dr. Ingo Gonzalo de Sanchez, Vorsitzender Richter am Oberlandesgericht Rostock
 Redaktion:
 RA Adam Gengelbach, Unterhachingen
 Ass. iur. Petra Priem, Herrenchiemsee
 """
 ```

 %% Cell type:markdown id:a9dff0d6c779882c tags:

 ## mistralai/Mistral-7B-Instruct-v0.2

+%% Cell type:markdown id:f9ba088a74f8c557 tags:
+
+### Set paths for model
+
+%% Cell type:code id:203bf0c10dd860a5 tags:
+
+``` python
+import os
+HF_MODEL_PATH = 'mistralai/Mistral-7B-Instruct-v0.2'
+LOCAL_MODEL_PATH = f'mlx_models/{HF_MODEL_PATH}'
+os.environ['HF_MODEL_PATH'] = HF_MODEL_PATH
+os.environ['LOCAL_MODEL_PATH'] = LOCAL_MODEL_PATH
+print(f"""
+HF_MODEL_PATH={HF_MODEL_PATH}
+LOCAL_MODEL_PATH={LOCAL_MODEL_PATH}
+""".strip())
+```
+
+%% Output
+
+    HF_MODEL_PATH=mistralai/Mistral-7B-Instruct-v0.2
+    LOCAL_MODEL_PATH=mlx_models/mistralai/Mistral-7B-Instruct-v0.2
+
+%% Cell type:markdown id:a52bdff5b0eae3bd tags:
+
+### Create a 4-Bit quantized model if necessary
+
+%% Cell type:code id:fdb9ec6772be0c23 tags:
+
+``` python
+![ -d "$LOCAL_MODEL_PATH" ] || python convert.py --hf-path "$HF_MODEL_PATH" --mlx-path "$LOCAL_MODEL_PATH" -q
+```
+
 %% Cell type:markdown id:30521e178126b249 tags:

 ### Generate training, testing and validation files

 %% Cell type:code id:31a2389404720256 tags:

 ``` python
 from lib.prepare_training_data import create_training_file
 import sys

 # Instruction prefix for every Mistral training record; `template_fn` below
 # concatenates the content to analyse directly after the "# content" marker.
 mistral_ft_instruction = f"""
 # instruction
 {system_message}
 # user
 {instruction}
 {epilog}
 # content
 """

 def template_fn(instruction: str, content: str, answer: str):
     """Render one training sequence in the `<s>[INST]...[/INST]...</s>` layout.

     Receives the instruction, the content to be analyzed, and the expected answer.
     """
     return '<s>[INST]{}{}[/INST]{}</s>'.format(instruction, content, answer)

 # Collect the keyword arguments first so the call itself stays short.
 mistral_training_options = dict(
     instruction=mistral_ft_instruction,
     template_func=template_fn,
     input_file='data/editors/editors.csv',
     output_dir='data/editors/mistral',
     content_dir='data/editors/website-data',
     max_chars=6000,
     max_gt_items=5,
     record_identifier_col="journal_abbr",
     cols_to_remove=['journal_abbr', 'website', 'retrieved_on'],
     column_to_filter_by='lastname',
     lines_before=2,
     lines_after=2,
 )
 create_training_file(**mistral_training_options)
 ```

 %% Output

    Length of generated sequences:
     - max: 5550
     - avg: 2259.182608695652
    Longest sequences:
    DivRuW: 5550
    JurBüro: 5051
    AVR: 4366
    APR: 4350
    AusR: 4244
    BKK: 4078
    DÖD: 3818
    EuZW: 3786
    HRN: 3467
    AuAS: 3272

 %% Cell type:code id:6181ba9486346975 tags:

 ``` python
 # Show the assembled fine-tuning instruction so its final layout can be inspected.
 print(mistral_ft_instruction)
 ```

 %% Output

    
    # instruction
    
    You are a text processing agent. As instructed below, extract information from the provided content in a structured format without discussing reasoning or providing commentary. Only use source text given as input for data extraction unless specifically asked for inference.
    
    # user
    
    Analyze content from a German law journal's website. Your task is to identify members of the editorial board (terms to look for: 'Herausgeber', 'Redakteur', 'Schriftleitung') and the advisory board ('Beirat'). For each identified member, extract and organize their information into the following categories: lastname, firstname, title (including academic titles like 'Dr.' or 'Prof. Dr.' and suffixes such as 'LL.M.'), position (their job title, if provided), affiliation, and role. For 'role', infer the role within the journal from the context (options 'Herausgeber', 'Redaktion', 'Schriftleitung', 'Beirat', or an empty string if the role is unknown).
    
    - Format the output as a YAML list of dictionaries.
    - Exclude any dictionary entries for which information is not available or relevant fields are empty.
    - Ensure the YAML output is strictly valid. It must be a list of dictionaries.
    
    
    Adhere to these guidelines to efficiently and accurately process the following content:"
    
    # content

-%% Cell type:code id:203bf0c10dd860a5 tags:
-
-``` python
-import os
-HF_MODEL_PATH = 'mistralai/Mistral-7B-Instruct-v0.2'
-LOCAL_MODEL_PATH = f'mlx_models/{HF_MODEL_PATH}'
-os.environ['HF_MODEL_PATH'] = HF_MODEL_PATH
-os.environ['LOCAL_MODEL_PATH'] = LOCAL_MODEL_PATH
-print(f"""
-HF_MODEL_PATH={HF_MODEL_PATH}
-LOCAL_MODEL_PATH={LOCAL_MODEL_PATH}
-""".strip())
-```
-
-%% Output
-
-    HF_MODEL_PATH=mistralai/Mistral-7B-Instruct-v0.2
-    LOCAL_MODEL_PATH=mlx_models/mistralai/Mistral-7B-Instruct-v0.2
-
-%% Cell type:markdown id:a52bdff5b0eae3bd tags:
-
-### Create a 4-Bit quantized model if necessary
-
-%% Cell type:code id:fdb9ec6772be0c23 tags:
-
-``` python
-![ -d "$LOCAL_MODEL_PATH" ] || python convert.py --hf-path "$HF_MODEL_PATH" --mlx-path "$LOCAL_MODEL_PATH" -q
-```
-
 %% Cell type:markdown id:8c46d1d132de28c3 tags:

 ### Finetuning

 %% Cell type:code id:fd1a48e84474aaea tags:

 ``` python
 # Run lora.py in training mode on the model at $LOCAL_MODEL_PATH with the data in
 # data/editors/mistral, using editors.npz inside the model directory as the adapter
 # file (600 iterations, batch size 1, 4 LoRA layers).
 !python lora.py --train \
    --model "$LOCAL_MODEL_PATH" \
    --data data/editors/mistral \
    --adapter-file "$LOCAL_MODEL_PATH/editors.npz" \
    --iters 600 --batch-size 1 --lora-layers 4
 ```

 %% Cell type:markdown id:4945c07efbb3b4e8 tags:

 To run in a separate shell:

 %% Cell type:code id:dc9af052b1e9a9e4 tags:

 ``` python
 # Print the same training command with LOCAL_MODEL_PATH already substituted,
 # so it can be copied into a separate shell. The doubled backslashes render
 # as single line-continuation backslashes in the printed text.
 print(f"""
 cd mlx/lora
 python lora.py --train \\
    --model {LOCAL_MODEL_PATH} \\
    --data data/editors/mistral \\
    --adapter-file {LOCAL_MODEL_PATH}/editors.npz \\
    --iters 600 --batch-size 1 --lora-layers 4
 """.strip())
 ```

 %% Output

    cd mlx/lora
    python lora.py --train \
        --model mlx_models/mistralai/Mistral-7B-Instruct-v0.2 \
        --data data/editors/mistral \
        --adapter-file mlx_models/mistralai/Mistral-7B-Instruct-v0.2/editors.npz \
        --iters 600 --batch-size 1 --lora-layers 4

 %% Cell type:markdown id:2f3bb7b9404da7e7 tags:

 Training loss: ~0.8, ~90 Tokens/sec

 %% Cell type:markdown id:27ec240d6a886b16 tags:

 ### Test the model with adapter

 %% Cell type:code id:a66ab3a823260361 tags:

 ``` python
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 !python lora.py --test \
    --model mlx_models/mistralai/Mistral-7B-Instruct-v0.2 \
    --data data/editors/mistral \
    --adapter-file mlx_models/mistralai/Mistral-7B-Instruct-v0.2/editors.npz
 ```

 %% Output

    python(39031) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.

    Testing
    Test loss 0.800, Test ppl 2.226.

 %% Cell type:markdown id:3bab8168bd116d38 tags:

 Result:
 600 iters: Test loss 0.800, Test ppl 2.226

 %% Cell type:markdown id:c7e42a5574068ba9 tags:

 ### Manual test prompt

 %% Cell type:code id:8d316a1e7570f1d4 tags:

 ``` python
 # Manual test prompt: system message, instruction and one-shot example, followed
 # by the sample content between "### CONTENT" and "### END OF CONTENT" markers.
 # NOTE(review): this layout ("### SYSTEM/USER/CONTENT", with example, no [INST]
 # wrapper) differs from the fine-tuning layout built from `mistral_ft_instruction`
 # and `template_fn` ("# instruction/# user/# content" inside <s>[INST]...[/INST]).
 # Confirm whether the adapter should be tested with the training layout instead.
 prompt=f"""
 ### SYSTEM
 {system_message}
 ### USER
 {instruction}
 {example}
 ### CONTENT
 {test_data}
 ### END OF CONTENT
 """.strip()
 ```

 %% Cell type:code id:3e7a823a9f4a35d9 tags:

 ``` python
 # Show the fully rendered manual test prompt before sending it to the model.
 print(prompt)
 ```

 %% Output

    ### SYSTEM
    
    You are a text processing agent. As instructed below, extract information from the provided content in a structured format without discussing reasoning or providing commentary. Only use source text given as input for data extraction unless specifically asked for inference.
    
    ### USER
    
    Analyze content from a German law journal's website. Your task is to identify members of the editorial board (terms to look for: 'Herausgeber', 'Redakteur', 'Schriftleitung') and the advisory board ('Beirat'). For each identified member, extract and organize their information into the following categories: lastname, firstname, title (including academic titles like 'Dr.' or 'Prof. Dr.' and suffixes such as 'LL.M.'), position (their job title, if provided), affiliation, and role. For 'role', infer the role within the journal from the context (options 'Herausgeber', 'Redaktion', 'Schriftleitung', 'Beirat', or an empty string if the role is unknown).
    
    - Format the output as a YAML list of dictionaries.
    - Exclude any dictionary entries for which information is not available or relevant fields are empty.
    - Ensure the YAML output is strictly valid. It must be a list of dictionaries.
    
    
    Here is an example:
    
    ```yaml
    - lastname: Mustermann
      firstname: Martina
      title: Dr.
      position: Vorsitzender Richterin
      affiliation: Oberlandesgericht Buxtehude
      role: Herausgeber
    ```
    
    ### CONTENT
    
    Herausgeber:
    Prof. Dr. Stefan Knesebeck, Universität Wuppertal
    Prof. Dr. Dr. h.c. Fritz M. Müller LL.M.(Yale), Universität Wanne-Eickel
    RA Prof. Dr. Vera Valentin, Hochschule für Recht und Sport Edingen
    Prof. Dr. Dr. h.c. Rita Rosenbaum, Universität Tupfingen
    Dr. Ingo Gonzalo de Sanchez, Vorsitzender Richter am Oberlandesgericht Rostock
    Redaktion:
    RA Adam Gengelbach, Unterhachingen
    Ass. iur. Petra Priem, Herrenchiemsee
    
    ### END OF CONTENT

 %% Cell type:code id:1ea4b39f35c09268 tags:

 ``` python
 import os
 import time
 os.environ['LLM_PROMPT'] = prompt
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 start_time = time.time()
 !python lora.py \
    --model mlx_models/mistralai/Mistral-7B-Instruct-v0.2 \
    --adapter-file mlx_models/mistralai/Mistral-7B-Instruct-v0.2/editors.npz \
    --max-tokens 400 \
    --temp 0 \
    --prompt "$LLM_PROMPT"
 print(f'Generation took {time.time() - start_time} seconds')
 ```

 %% Output

    python(39255) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.

    
    
    - lastname: Knesebeck
      firstname: Stefan
      title: Prof. Dr.
      position: Universität Wuppertal
      affiliation: Universität Wuppertal
      role: Herausgeber
    - lastname: Müller
      firstname: Fritz M.
      title: Prof. Dr. Dr. h.c. LL.M.(Yale)
      position: Universität Wanne-Eickel
      affiliation: Universität Wanne-Eickel
      role: Herausgeber
    - lastname: Valentin
      firstname: Vera
      title: Prof. Dr.
      position: Hochschule für Recht und Sport Edingen
      affiliation: Hochschule für Recht und Sport Edingen
      role: Redaktion
    - lastname: Rosenbaum
      firstname: Rita
      title: Prof. Dr. Dr. h.c.
      position: Universität Tupfingen
      affiliation: Universität Tupfingen
      role: Herausgeber
    - lastname: Gonzalo de Sanchez
      firstname: Ingo
      title: Dr.
      position: Vorsitzender Richter am Oberlandesgericht Rostock
      affiliation: Oberlandesgericht Rostock
      role: Herausgeber
    - lastname: Gengelbach
      firstname: Adam
      title: RA
      position: Unterhachingen
      affiliation: Unterhachingen
      role: Redaktion
    - lastname: Priem
      firstname: Petra
      title: Ass. iur.
      position: Herrenchiemsee
      affiliation: Herrenchiemsee
      role: Redaktion
    Generation took 87.13131785392761 seconds

 %% Cell type:markdown id:d1b0c8c8648906b7 tags:

 ## mlx-community/quantized-gemma-7b-it

 This model can be downloaded directly from HF; no conversion is necessary.

 %% Cell type:markdown id:7c5659b8c268e72f tags:

 ### Zero-shot

 %% Cell type:code id:89e1a05fc3b6e435 tags:

 ``` python
 import os
 import time

 from mlx_lm import load, generate

 os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 # Plain-text zero-shot prompt; section headers now use the same "###" level.
 prompt = f"""
 ### instructions
 {system_message}
 ### user
 {instruction}
 {example}
 {epilog}

 {test_data}

 """.strip()

 # Wrap the prompt in the Gemma instruction-tuned turn markers (the same ones the
 # training template in the next section uses). Without them the recorded output
 # shows the model continuing the page text instead of answering.
 # No <bos> here: presumably the tokenizer adds it when encoding - TODO confirm.
 chat_prompt = f'<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n'

 model, tokenizer = load("mlx-community/quantized-gemma-7b-it")
 # Time only the generation, not the model download/load.
 start_time = time.time()
 response = generate(model, tokenizer, prompt=chat_prompt, verbose=False, max_tokens=300, temp=0)
 print(response)
 print(f'Generation took {time.time() - start_time} seconds')
 ```

 %% Output


    
    Schriftleitung:
    Dr. Martin Schmidt, Berlin
    Beirat:
    Dr. Hans-Peter Kaulitz, Berlin
    Dr. Franz-Josef Schmidt, München
    
    ```
    
    **Expected Output:**
    
    ```yaml
    - lastname: Knesebeck
      firstname: Stefan
      title: Prof. Dr.
      position: N/A
      affiliation: Universität Wuppertal
      role: Herausgeber
    
    - lastname: Müller
      firstname: Fritz M.
      title: Prof. Dr. Dr. h.c. LL.M.(Yale)
      position: N/A
      affiliation: Universität Wanne-Eickel
      role: Herausgeber
    
    - lastname: Valentin
      firstname: Vera
      title: RA Prof. Dr.
      position: N/A
      affiliation: Hochschule für Recht und Sport Edingen
      role: N/A
    
    - lastname: Rosenbaum
      firstname: Rita
      title: Prof. Dr. Dr. h.c.
      position: N/A
      affiliation: Universität Tupfingen
      role: N/A
    
    - lastname: Gonzalo de Sanchez
      firstname: Ingo
      title: Dr.
      position: Vorsitzender Richter am Oberlandesgericht Rostock
      affiliation: Oberlandesgericht Rostock
      role: N/A
    
    - lastname: Gengelbach
      firstname: Adam
      title: RA
      position: N/A
    Generation took 50.564462184906006 seconds

 %% Cell type:markdown id:e48938d56b99848c tags:

 ### Generate training, testing and validation files

 based on https://gist.github.com/alexweberk/635431b5c5773efd6d1755801020429f

 %% Cell type:code id:8d61e8cf63aa5965 tags:

 ``` python
 from lib.prepare_training_data import create_training_file

 # Instruction prefix for every Gemma training record (system message, task
 # description and epilog). The stray "'" after {epilog} has been removed.
 prompt = f"""
 # instructions
 {system_message}
 # user
 {instruction}
 {epilog}
 """.strip()

 # Same three-argument contract as the Mistral template above: the template
 # function receives the instruction, the content to be analyzed, and the
 # expected answer. The previous two-argument version did not match that contract.
 def template_fn(instruction: str, content: str, answer: str):
     """Render one training sequence using the Gemma turn markers."""
     return f'<bos><start_of_turn>user\n{instruction}\n{content}<end_of_turn>\n<start_of_turn>model\n{answer}<end_of_turn><eos>'

 # Pass the assembled `prompt` (previously the bare `instruction` was passed and
 # `prompt` was never used, so the system message and epilog were dropped).
 # NOTE(review): output_dir is 'data/editors-gemma' while the Mistral files go to
 # 'data/editors/mistral' - confirm which directory layout is intended.
 create_training_file(instruction=prompt,
                      template_func=template_fn,
                      input_file='data/editors/editors.csv',
                      output_dir='data/editors-gemma',
                      content_dir='data/editors/website-data',
                      max_chars=6000, max_gt_items=5,
                      record_identifier_col="journal_abbr",
                      cols_to_remove=['journal_abbr', 'website', 'retrieved_on'],
                      column_to_filter_by='lastname',
                      lines_before=2, lines_after=2)
 ```

 %% Output

    Length of generated sequences:
     - max: 5107
     - avg: 1976.0964912280701
    Longest sequences:
    FoR: 5107
    DivRuW: 5097
    AfP: 4519
    StAZ: 4418
    DÖD: 4220
    ECFR: 3519
    APR: 3445
    CB: 3387
    AuA: 3317
    HRN: 3128

 %% Cell type:code id:db51ef32ff18dff3 tags:

 ``` python
 ```