From 95b65ae71598f43d071d5f4dea8ba98bfcaeb81e Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Tue, 5 Mar 2024 10:50:07 +0100
Subject: [PATCH] add gitignore

---
 cuda/.gitignore             |   1 +
 cuda/gemma-finetuning.ipynb | 660 +++++++++++++++++++++++++++++++++---
 2 files changed, 621 insertions(+), 40 deletions(-)
 create mode 100644 cuda/.gitignore

diff --git a/cuda/.gitignore b/cuda/.gitignore
new file mode 100644
index 0000000..2eea525
--- /dev/null
+++ b/cuda/.gitignore
@@ -0,0 +1 @@
+.env
\ No newline at end of file
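Note: the ignored .env file is expected to hold the Hugging Face access token that the notebook reads via os.environ['HF_TOKEN'] (the KeyError traceback removed further below shows what happens when it is missing). A minimal sketch of wiring this up with the python-dotenv package the notebook installs, assuming cuda/.env contains a line HF_TOKEN=<your token>:

    import os
    from dotenv import load_dotenv

    # Read .env from the working directory (git-ignored by this patch) into
    # the process environment. Assumed contents: HF_TOKEN=<your HF token>
    load_dotenv()

    hf_token = os.environ.get("HF_TOKEN")
    if hf_token is None:
        raise RuntimeError("HF_TOKEN not found; add it to cuda/.env or export it")
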
diff --git a/cuda/gemma-finetuning.ipynb b/cuda/gemma-finetuning.ipynb
index deaf6cd..41e33f0 100644
--- a/cuda/gemma-finetuning.ipynb
+++ b/cuda/gemma-finetuning.ipynb
@@ -20,12 +20,605 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "13bfb820-9615-48bd-96a9-5f454f1e67a9",
    "metadata": {
-    "tags": []
+    "tags": [],
+    "ExecuteTime": {
+     "end_time": "2024-03-05T08:53:33.277048200Z",
+     "start_time": "2024-03-05T08:48:47.405777Z"
+    }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting transformers\n",
+      "  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)\n",
+      "     ---------------------------------------- 0.0/130.7 kB ? eta -:--:--\n",
+      "     --- ------------------------------------ 10.2/130.7 kB ? eta -:--:--\n",
+      "     --------------------------- ----------- 92.2/130.7 kB 1.3 MB/s eta 0:00:01\n",
+      "     -------------------------------------- 130.7/130.7 kB 1.3 MB/s eta 0:00:00\n",
+      "Collecting datasets\n",
+      "  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)\n",
+      "Collecting peft\n",
+      "  Downloading peft-0.9.0-py3-none-any.whl.metadata (13 kB)\n",
+      "Requirement already satisfied: python-dotenv in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (0.21.0)\n",
+      "Collecting accelerate\n",
+      "  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)\n",
+      "Collecting trl\n",
+      "  Downloading trl-0.7.11-py3-none-any.whl.metadata (10 kB)\n",
+      "Requirement already satisfied: filelock in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from transformers) (3.13.1)\n",
+      "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from transformers) (0.20.2)\n",
+      "Requirement already satisfied: numpy>=1.17 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from transformers) (1.26.3)\n",
+      "Requirement already satisfied: packaging>=20.0 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from transformers) (23.2)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from transformers) (6.0.1)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from transformers) (2023.12.25)\n",
+      "Requirement already satisfied: requests in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from transformers) (2.31.0)\n",
+      "Collecting tokenizers<0.19,>=0.14 (from transformers)\n",
+      "  Downloading tokenizers-0.15.2-cp311-none-win_amd64.whl.metadata (6.8 kB)\n",
+      "Collecting safetensors>=0.4.1 (from transformers)\n",
+      "  Downloading safetensors-0.4.2-cp311-none-win_amd64.whl.metadata (3.9 kB)\n",
+      "Requirement already satisfied: tqdm>=4.27 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from transformers) (4.65.0)\n",
+      "Collecting pyarrow>=12.0.0 (from datasets)\n",
+      "  Downloading pyarrow-15.0.0-cp311-cp311-win_amd64.whl.metadata (3.1 kB)\n",
+      "Collecting pyarrow-hotfix (from datasets)\n",
+      "  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n",
+      "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
+      "  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
+      "Requirement already satisfied: pandas in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from datasets) (2.1.4)\n",
+      "Collecting xxhash (from datasets)\n",
+      "  Downloading xxhash-3.4.1-cp311-cp311-win_amd64.whl.metadata (12 kB)\n",
+      "Collecting multiprocess (from datasets)\n",
+      "  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)\n",
+      "Requirement already satisfied: fsspec<=2024.2.0,>=2023.1.0 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from fsspec[http]<=2024.2.0,>=2023.1.0->datasets) (2023.12.2)\n",
+      "Requirement already satisfied: aiohttp in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from datasets) (3.9.1)\n",
+      "Requirement already satisfied: psutil in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from peft) (5.9.0)\n",
+      "Collecting torch>=1.13.0 (from peft)\n",
+      "  Downloading torch-2.2.1-cp311-cp311-win_amd64.whl.metadata (26 kB)\n",
+      "Collecting tyro>=0.5.11 (from trl)\n",
+      "  Downloading tyro-0.7.3-py3-none-any.whl.metadata (7.7 kB)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from aiohttp->datasets) (23.1.0)\n",
+      "Requirement already satisfied: multidict<7.0,>=4.5 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from aiohttp->datasets) (6.0.4)\n",
+      "Requirement already satisfied: yarl<2.0,>=1.0 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from aiohttp->datasets) (1.9.4)\n",
+      "Requirement already satisfied: frozenlist>=1.1.1 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from aiohttp->datasets) (1.4.1)\n",
+      "Requirement already satisfied: aiosignal>=1.1.2 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from aiohttp->datasets) (1.3.1)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from huggingface-hub<1.0,>=0.19.3->transformers) (4.9.0)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from requests->transformers) (2.0.4)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from requests->transformers) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from requests->transformers) (1.26.18)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from requests->transformers) (2023.11.17)\n",
+      "Collecting sympy (from torch>=1.13.0->peft)\n",
+      "  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)\n",
+      "Requirement already satisfied: networkx in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from torch>=1.13.0->peft) (3.2.1)\n",
+      "Requirement already satisfied: jinja2 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from torch>=1.13.0->peft) (3.1.2)\n",
+      "Requirement already satisfied: colorama in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from tqdm>=4.27->transformers) (0.4.6)\n",
+      "Collecting docstring-parser>=0.14.1 (from tyro>=0.5.11->trl)\n",
+      "  Downloading docstring_parser-0.15-py3-none-any.whl.metadata (2.4 kB)\n",
+      "Requirement already satisfied: rich>=11.1.0 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from tyro>=0.5.11->trl) (13.7.0)\n",
+      "Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)\n",
+      "  Downloading shtab-1.7.0-py3-none-any.whl.metadata (7.3 kB)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from pandas->datasets) (2.8.2)\n",
+      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from pandas->datasets) (2023.3.post1)\n",
+      "Requirement already satisfied: tzdata>=2022.1 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from pandas->datasets) (2023.3)\n",
+      "Requirement already satisfied: six>=1.5 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+      "Requirement already satisfied: markdown-it-py>=2.2.0 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from rich>=11.1.0->tyro>=0.5.11->trl) (3.0.0)\n",
+      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from rich>=11.1.0->tyro>=0.5.11->trl) (2.15.1)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from jinja2->torch>=1.13.0->peft) (2.1.3)\n",
+      "Collecting mpmath>=0.19 (from sympy->torch>=1.13.0->peft)\n",
+      "  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)\n",
+      "Requirement already satisfied: mdurl~=0.1 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from markdown-it-py>=2.2.0->rich>=11.1.0->tyro>=0.5.11->trl) (0.1.2)\n",
+      "Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)\n",
+      "   ---------------------------------------- 0.0/8.5 MB ? eta -:--:--\n",
+      "   -- ------------------------------------- 0.6/8.5 MB 11.8 MB/s eta 0:00:01\n",
+      "   ------ --------------------------------- 1.5/8.5 MB 18.8 MB/s eta 0:00:01\n",
+      "   ------------ --------------------------- 2.6/8.5 MB 20.6 MB/s eta 0:00:01\n",
+      "   ----------------- ---------------------- 3.7/8.5 MB 23.4 MB/s eta 0:00:01\n",
+      "   ---------------------- ----------------- 4.8/8.5 MB 23.4 MB/s eta 0:00:01\n",
+      "   --------------------------- ------------ 5.8/8.5 MB 23.3 MB/s eta 0:00:01\n",
+      "   -------------------------------- ------- 7.0/8.5 MB 24.8 MB/s eta 0:00:01\n",
+      "   ------------------------------------- -- 8.0/8.5 MB 24.5 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  8.5/8.5 MB 24.9 MB/s eta 0:00:01\n",
+      "   ---------------------------------------- 8.5/8.5 MB 21.0 MB/s eta 0:00:00\n",
+      "Downloading datasets-2.18.0-py3-none-any.whl (510 kB)\n",
+      "   ---------------------------------------- 0.0/510.5 kB ? eta -:--:--\n",
+      "   -------------------------------- ------ 419.8/510.5 kB 12.8 MB/s eta 0:00:01\n",
+      "   -------------------------------------- - 491.5/510.5 kB 5.1 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  501.8/510.5 kB 4.5 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  501.8/510.5 kB 4.5 MB/s eta 0:00:01\n",
+      "   ---------------------------------------- 510.5/510.5 kB 2.7 MB/s eta 0:00:00\n",
+      "Downloading peft-0.9.0-py3-none-any.whl (190 kB)\n",
+      "   ---------------------------------------- 0.0/190.9 kB ? eta -:--:--\n",
+      "   -------------------------------------- - 184.3/190.9 kB ? eta -:--:--\n",
+      "   ---------------------------------------- 190.9/190.9 kB 5.6 MB/s eta 0:00:00\n",
+      "Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)\n",
+      "   ---------------------------------------- 0.0/280.0 kB ? eta -:--:--\n",
+      "   ---------------------------------------- 280.0/280.0 kB 8.7 MB/s eta 0:00:00\n",
+      "Downloading trl-0.7.11-py3-none-any.whl (155 kB)\n",
+      "   ---------------------------------------- 0.0/155.3 kB ? eta -:--:--\n",
+      "   ---------------------------------------- 155.3/155.3 kB 4.7 MB/s eta 0:00:00\n",
+      "Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
+      "   ---------------------------------------- 0.0/116.3 kB ? eta -:--:--\n",
+      "   ---------------------------------------- 116.3/116.3 kB 2.3 MB/s eta 0:00:00\n",
+      "Downloading pyarrow-15.0.0-cp311-cp311-win_amd64.whl (24.8 MB)\n",
+      "   ---------------------------------------- 0.0/24.8 MB ? eta -:--:--\n",
+      "   -- ------------------------------------- 1.5/24.8 MB 31.6 MB/s eta 0:00:01\n",
+      "   ---- ----------------------------------- 2.5/24.8 MB 27.0 MB/s eta 0:00:01\n",
+      "   ----- ---------------------------------- 3.6/24.8 MB 28.8 MB/s eta 0:00:01\n",
+      "   ------- -------------------------------- 4.7/24.8 MB 27.2 MB/s eta 0:00:01\n",
+      "   --------- ------------------------------ 5.8/24.8 MB 28.4 MB/s eta 0:00:01\n",
+      "   ---------- ----------------------------- 6.8/24.8 MB 27.1 MB/s eta 0:00:01\n",
+      "   ------------ --------------------------- 7.9/24.8 MB 26.6 MB/s eta 0:00:01\n",
+      "   -------------- ------------------------- 9.0/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   ---------------- ----------------------- 10.0/24.8 MB 26.7 MB/s eta 0:00:01\n",
+      "   ----------------- ---------------------- 11.1/24.8 MB 27.3 MB/s eta 0:00:01\n",
+      "   ------------------- -------------------- 12.1/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   --------------------- ------------------ 13.1/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   ---------------------- ----------------- 14.1/24.8 MB 27.3 MB/s eta 0:00:01\n",
+      "   ------------------------ --------------- 15.3/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   -------------------------- ------------- 16.3/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   --------------------------- ------------ 17.3/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   ----------------------------- ---------- 18.4/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   ------------------------------- -------- 19.4/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   -------------------------------- ------- 20.4/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   ---------------------------------- ----- 21.4/24.8 MB 26.2 MB/s eta 0:00:01\n",
+      "   ------------------------------------ --- 22.4/24.8 MB 25.2 MB/s eta 0:00:01\n",
+      "   ------------------------------------- -- 23.3/24.8 MB 25.2 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  24.3/24.8 MB 25.2 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  24.8/24.8 MB 25.2 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  24.8/24.8 MB 25.2 MB/s eta 0:00:01\n",
+      "   ---------------------------------------- 24.8/24.8 MB 20.4 MB/s eta 0:00:00\n",
+      "Downloading safetensors-0.4.2-cp311-none-win_amd64.whl (269 kB)\n",
+      "   ---------------------------------------- 0.0/269.6 kB ? eta -:--:--\n",
+      "   ---------------------------------------- 269.6/269.6 kB 8.4 MB/s eta 0:00:00\n",
+      "Downloading tokenizers-0.15.2-cp311-none-win_amd64.whl (2.2 MB)\n",
+      "   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n",
+      "   ----------------------- ---------------- 1.3/2.2 MB 40.6 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  2.2/2.2 MB 28.0 MB/s eta 0:00:01\n",
+      "   ---------------------------------------- 2.2/2.2 MB 20.0 MB/s eta 0:00:00\n",
+      "Downloading torch-2.2.1-cp311-cp311-win_amd64.whl (198.6 MB)\n",
+      "   ---------------------------------------- 0.0/198.6 MB ? eta -:--:--\n",
+      "   ---------------------------------------- 1.1/198.6 MB 35.3 MB/s eta 0:00:06\n",
+      "   ---------------------------------------- 2.1/198.6 MB 33.0 MB/s eta 0:00:06\n",
+      "    --------------------------------------- 3.1/198.6 MB 28.0 MB/s eta 0:00:07\n",
+      "    --------------------------------------- 4.0/198.6 MB 28.5 MB/s eta 0:00:07\n",
+      "   - -------------------------------------- 5.0/198.6 MB 26.8 MB/s eta 0:00:08\n",
+      "   - -------------------------------------- 6.0/198.6 MB 25.4 MB/s eta 0:00:08\n",
+      "   - -------------------------------------- 7.0/198.6 MB 24.8 MB/s eta 0:00:08\n",
+      "   - -------------------------------------- 7.9/198.6 MB 25.3 MB/s eta 0:00:08\n",
+      "   - -------------------------------------- 9.0/198.6 MB 26.1 MB/s eta 0:00:08\n",
+      "   -- ------------------------------------- 10.1/198.6 MB 25.9 MB/s eta 0:00:08\n",
+      "   -- ------------------------------------- 11.1/198.6 MB 25.2 MB/s eta 0:00:08\n",
+      "   -- ------------------------------------- 12.0/198.6 MB 24.2 MB/s eta 0:00:08\n",
+      "   -- ------------------------------------- 13.0/198.6 MB 25.2 MB/s eta 0:00:08\n",
+      "   -- ------------------------------------- 14.1/198.6 MB 24.2 MB/s eta 0:00:08\n",
+      "   --- ------------------------------------ 15.2/198.6 MB 25.1 MB/s eta 0:00:08\n",
+      "   --- ------------------------------------ 16.1/198.6 MB 25.2 MB/s eta 0:00:08\n",
+      "   --- ------------------------------------ 17.1/198.6 MB 25.1 MB/s eta 0:00:08\n",
+      "   --- ------------------------------------ 18.2/198.6 MB 25.1 MB/s eta 0:00:08\n",
+      "   --- ------------------------------------ 19.1/198.6 MB 25.2 MB/s eta 0:00:08\n",
+      "   ---- ----------------------------------- 20.2/198.6 MB 25.1 MB/s eta 0:00:08\n",
+      "   ---- ----------------------------------- 21.1/198.6 MB 24.2 MB/s eta 0:00:08\n",
+      "   ---- ----------------------------------- 22.3/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   ---- ----------------------------------- 23.4/198.6 MB 25.1 MB/s eta 0:00:07\n",
+      "   ---- ----------------------------------- 24.5/198.6 MB 25.1 MB/s eta 0:00:07\n",
+      "   ----- ---------------------------------- 25.4/198.6 MB 25.2 MB/s eta 0:00:07\n",
+      "   ----- ---------------------------------- 26.3/198.6 MB 25.1 MB/s eta 0:00:07\n",
+      "   ----- ---------------------------------- 27.4/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   ----- ---------------------------------- 28.4/198.6 MB 25.2 MB/s eta 0:00:07\n",
+      "   ----- ---------------------------------- 29.5/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   ------ --------------------------------- 30.6/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   ------ --------------------------------- 31.7/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   ------ --------------------------------- 32.8/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   ------ --------------------------------- 33.9/198.6 MB 25.2 MB/s eta 0:00:07\n",
+      "   ------- -------------------------------- 34.9/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   ------- -------------------------------- 35.9/198.6 MB 25.1 MB/s eta 0:00:07\n",
+      "   ------- -------------------------------- 37.0/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   ------- -------------------------------- 38.0/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   ------- -------------------------------- 39.1/198.6 MB 26.2 MB/s eta 0:00:07\n",
+      "   -------- ------------------------------- 40.2/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   -------- ------------------------------- 41.3/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   -------- ------------------------------- 42.4/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   -------- ------------------------------- 43.5/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   -------- ------------------------------- 44.6/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   --------- ------------------------------ 45.7/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   --------- ------------------------------ 46.8/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   --------- ------------------------------ 47.9/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   --------- ------------------------------ 49.0/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   ---------- ----------------------------- 50.0/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   ---------- ----------------------------- 51.1/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   ---------- ----------------------------- 52.1/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   ---------- ----------------------------- 53.2/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   ---------- ----------------------------- 54.3/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   ----------- ---------------------------- 55.3/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   ----------- ---------------------------- 56.5/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   ----------- ---------------------------- 57.5/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   ----------- ---------------------------- 58.6/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   ------------ --------------------------- 59.7/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   ------------ --------------------------- 60.7/198.6 MB 26.2 MB/s eta 0:00:06\n",
+      "   ------------ --------------------------- 61.8/198.6 MB 27.3 MB/s eta 0:00:06\n",
+      "   ------------ --------------------------- 62.9/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ------------ --------------------------- 64.0/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ------------- -------------------------- 65.1/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ------------- -------------------------- 66.2/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ------------- -------------------------- 67.3/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ------------- -------------------------- 68.4/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ------------- -------------------------- 69.4/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   -------------- ------------------------- 70.5/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   -------------- ------------------------- 71.6/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   -------------- ------------------------- 72.7/198.6 MB 26.2 MB/s eta 0:00:05\n",
+      "   -------------- ------------------------- 73.8/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   --------------- ------------------------ 74.9/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   --------------- ------------------------ 76.0/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   --------------- ------------------------ 77.1/198.6 MB 26.2 MB/s eta 0:00:05\n",
+      "   --------------- ------------------------ 78.2/198.6 MB 26.2 MB/s eta 0:00:05\n",
+      "   --------------- ------------------------ 79.3/198.6 MB 26.2 MB/s eta 0:00:05\n",
+      "   ---------------- ----------------------- 80.4/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ---------------- ----------------------- 81.5/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ---------------- ----------------------- 82.6/198.6 MB 26.2 MB/s eta 0:00:05\n",
+      "   ---------------- ----------------------- 83.6/198.6 MB 26.2 MB/s eta 0:00:05\n",
+      "   ----------------- ---------------------- 84.7/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ----------------- ---------------------- 85.8/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ----------------- ---------------------- 86.9/198.6 MB 28.5 MB/s eta 0:00:04\n",
+      "   ----------------- ---------------------- 88.0/198.6 MB 27.3 MB/s eta 0:00:05\n",
+      "   ----------------- ---------------------- 89.1/198.6 MB 26.2 MB/s eta 0:00:05\n",
+      "   ------------------ --------------------- 90.1/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ------------------ --------------------- 91.3/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ------------------ --------------------- 92.3/198.6 MB 26.2 MB/s eta 0:00:05\n",
+      "   ------------------ --------------------- 93.4/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ------------------- -------------------- 94.5/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ------------------- -------------------- 95.4/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ------------------- -------------------- 96.4/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ------------------- -------------------- 97.5/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ------------------- -------------------- 98.6/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   -------------------- ------------------- 99.7/198.6 MB 26.2 MB/s eta 0:00:04\n",
+      "   ------------------- ------------------- 100.8/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   -------------------- ------------------ 101.9/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   -------------------- ------------------ 103.0/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   -------------------- ------------------ 104.0/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   -------------------- ------------------ 105.1/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   -------------------- ------------------ 106.1/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   --------------------- ----------------- 107.3/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   --------------------- ----------------- 108.4/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   --------------------- ----------------- 109.5/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   --------------------- ----------------- 110.6/198.6 MB 26.2 MB/s eta 0:00:04\n",
+      "   --------------------- ----------------- 111.7/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ---------------------- ---------------- 112.7/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ---------------------- ---------------- 113.8/198.6 MB 27.3 MB/s eta 0:00:04\n",
+      "   ---------------------- ---------------- 114.9/198.6 MB 26.2 MB/s eta 0:00:04\n",
+      "   ---------------------- ---------------- 116.0/198.6 MB 26.2 MB/s eta 0:00:04\n",
+      "   ---------------------- ---------------- 117.1/198.6 MB 27.3 MB/s eta 0:00:03\n",
+      "   ----------------------- --------------- 118.1/198.6 MB 27.3 MB/s eta 0:00:03\n",
+      "   ----------------------- --------------- 119.2/198.6 MB 27.3 MB/s eta 0:00:03\n",
+      "   ----------------------- --------------- 120.3/198.6 MB 26.2 MB/s eta 0:00:03\n",
+      "   ----------------------- --------------- 121.4/198.6 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------------------ -------------- 122.4/198.6 MB 26.2 MB/s eta 0:00:03\n",
+      "   ------------------------ -------------- 123.2/198.6 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------------------ -------------- 124.3/198.6 MB 25.2 MB/s eta 0:00:03\n",
+      "   ------------------------ -------------- 125.3/198.6 MB 25.2 MB/s eta 0:00:03\n",
+      "   ------------------------ -------------- 126.3/198.6 MB 26.2 MB/s eta 0:00:03\n",
+      "   ------------------------- ------------- 127.3/198.6 MB 25.2 MB/s eta 0:00:03\n",
+      "   ------------------------- ------------- 128.2/198.6 MB 24.2 MB/s eta 0:00:03\n",
+      "   ------------------------- ------------- 129.4/198.6 MB 25.2 MB/s eta 0:00:03\n",
+      "   ------------------------- ------------- 130.3/198.6 MB 25.2 MB/s eta 0:00:03\n",
+      "   ------------------------- ------------- 131.2/198.6 MB 24.3 MB/s eta 0:00:03\n",
+      "   ------------------------- ------------- 132.2/198.6 MB 24.2 MB/s eta 0:00:03\n",
+      "   -------------------------- ------------ 132.8/198.6 MB 23.4 MB/s eta 0:00:03\n",
+      "   -------------------------- ------------ 133.7/198.6 MB 22.6 MB/s eta 0:00:03\n",
+      "   -------------------------- ------------ 134.6/198.6 MB 23.4 MB/s eta 0:00:03\n",
+      "   -------------------------- ------------ 135.6/198.6 MB 22.6 MB/s eta 0:00:03\n",
+      "   -------------------------- ------------ 136.6/198.6 MB 23.4 MB/s eta 0:00:03\n",
+      "   --------------------------- ----------- 137.7/198.6 MB 22.6 MB/s eta 0:00:03\n",
+      "   --------------------------- ----------- 138.7/198.6 MB 23.4 MB/s eta 0:00:03\n",
+      "   --------------------------- ----------- 139.8/198.6 MB 23.4 MB/s eta 0:00:03\n",
+      "   --------------------------- ----------- 140.9/198.6 MB 23.4 MB/s eta 0:00:03\n",
+      "   --------------------------- ----------- 142.0/198.6 MB 24.2 MB/s eta 0:00:03\n",
+      "   ---------------------------- ---------- 143.0/198.6 MB 24.3 MB/s eta 0:00:03\n",
+      "   ---------------------------- ---------- 144.3/198.6 MB 25.2 MB/s eta 0:00:03\n",
+      "   ---------------------------- ---------- 145.4/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   ---------------------------- ---------- 146.4/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ---------------------------- ---------- 147.5/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ----------------------------- --------- 148.6/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ----------------------------- --------- 149.7/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ----------------------------- --------- 150.8/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   ----------------------------- --------- 151.8/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ------------------------------ -------- 153.0/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ------------------------------ -------- 154.0/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ------------------------------ -------- 155.1/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   ------------------------------ -------- 156.2/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   ------------------------------ -------- 157.2/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ------------------------------- ------- 158.4/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   ------------------------------- ------- 159.5/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   ------------------------------- ------- 160.8/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   ------------------------------- ------- 161.9/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   -------------------------------- ------ 163.0/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   -------------------------------- ------ 164.1/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   -------------------------------- ------ 165.2/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   -------------------------------- ------ 166.3/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   -------------------------------- ------ 167.3/198.6 MB 27.3 MB/s eta 0:00:02\n",
+      "   --------------------------------- ----- 168.3/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   --------------------------------- ----- 169.2/198.6 MB 26.2 MB/s eta 0:00:02\n",
+      "   --------------------------------- ----- 170.0/198.6 MB 25.1 MB/s eta 0:00:02\n",
+      "   --------------------------------- ----- 170.8/198.6 MB 25.2 MB/s eta 0:00:02\n",
+      "   --------------------------------- ----- 171.4/198.6 MB 24.2 MB/s eta 0:00:02\n",
+      "   --------------------------------- ----- 172.0/198.6 MB 22.6 MB/s eta 0:00:02\n",
+      "   --------------------------------- ----- 172.8/198.6 MB 22.6 MB/s eta 0:00:02\n",
+      "   ---------------------------------- ---- 173.5/198.6 MB 21.1 MB/s eta 0:00:02\n",
+      "   ---------------------------------- ---- 174.3/198.6 MB 21.1 MB/s eta 0:00:02\n",
+      "   ---------------------------------- ---- 175.2/198.6 MB 21.1 MB/s eta 0:00:02\n",
+      "   ---------------------------------- ---- 176.0/198.6 MB 20.5 MB/s eta 0:00:02\n",
+      "   ---------------------------------- ---- 176.8/198.6 MB 20.5 MB/s eta 0:00:02\n",
+      "   ---------------------------------- ---- 177.6/198.6 MB 19.8 MB/s eta 0:00:02\n",
+      "   ----------------------------------- --- 178.4/198.6 MB 19.3 MB/s eta 0:00:02\n",
+      "   ----------------------------------- --- 179.2/198.6 MB 19.3 MB/s eta 0:00:02\n",
+      "   ----------------------------------- --- 180.1/198.6 MB 19.3 MB/s eta 0:00:01\n",
+      "   ----------------------------------- --- 180.9/198.6 MB 19.3 MB/s eta 0:00:01\n",
+      "   ----------------------------------- --- 181.7/198.6 MB 19.8 MB/s eta 0:00:01\n",
+      "   ----------------------------------- --- 182.5/198.6 MB 19.8 MB/s eta 0:00:01\n",
+      "   ----------------------------------- --- 183.3/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   ------------------------------------ -- 184.1/198.6 MB 19.8 MB/s eta 0:00:01\n",
+      "   ------------------------------------ -- 184.9/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   ------------------------------------ -- 185.7/198.6 MB 19.9 MB/s eta 0:00:01\n",
+      "   ------------------------------------ -- 186.5/198.6 MB 19.8 MB/s eta 0:00:01\n",
+      "   ------------------------------------ -- 187.2/198.6 MB 19.9 MB/s eta 0:00:01\n",
+      "   ------------------------------------ -- 188.1/198.6 MB 19.3 MB/s eta 0:00:01\n",
+      "   ------------------------------------- - 188.8/198.6 MB 19.9 MB/s eta 0:00:01\n",
+      "   ------------------------------------- - 189.6/198.6 MB 19.3 MB/s eta 0:00:01\n",
+      "   ------------------------------------- - 190.4/198.6 MB 19.8 MB/s eta 0:00:01\n",
+      "   ------------------------------------- - 191.1/198.6 MB 19.3 MB/s eta 0:00:01\n",
+      "   ------------------------------------- - 192.0/198.6 MB 19.9 MB/s eta 0:00:01\n",
+      "   ------------------------------------- - 192.8/198.6 MB 19.8 MB/s eta 0:00:01\n",
+      "   --------------------------------------  193.7/198.6 MB 19.3 MB/s eta 0:00:01\n",
+      "   --------------------------------------  194.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  195.4/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  196.3/198.6 MB 19.9 MB/s eta 0:00:01\n",
+      "   --------------------------------------  197.2/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.1/198.6 MB 19.9 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  198.6/198.6 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------- 198.6/198.6 MB 12.8 MB/s eta 0:00:00\n",
+      "Downloading tyro-0.7.3-py3-none-any.whl (79 kB)\n",
+      "   ---------------------------------------- 0.0/79.8 kB ? eta -:--:--\n",
+      "   ----------------------------------- ---- 71.7/79.8 kB ? eta -:--:--\n",
+      "   ---------------------------------------- 79.8/79.8 kB 2.2 MB/s eta 0:00:00\n",
+      "Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)\n",
+      "   ---------------------------------------- 0.0/143.5 kB ? eta -:--:--\n",
+      "   ---------------------------------------- 143.5/143.5 kB 4.3 MB/s eta 0:00:00\n",
+      "Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
+      "Downloading xxhash-3.4.1-cp311-cp311-win_amd64.whl (29 kB)\n",
+      "Downloading docstring_parser-0.15-py3-none-any.whl (36 kB)\n",
+      "Downloading shtab-1.7.0-py3-none-any.whl (14 kB)\n",
+      "Downloading sympy-1.12-py3-none-any.whl (5.7 MB)\n",
+      "   ---------------------------------------- 0.0/5.7 MB ? eta -:--:--\n",
+      "   ------ --------------------------------- 1.0/5.7 MB 20.9 MB/s eta 0:00:01\n",
+      "   ------------ --------------------------- 1.8/5.7 MB 23.5 MB/s eta 0:00:01\n",
+      "   ------------------- -------------------- 2.7/5.7 MB 19.3 MB/s eta 0:00:01\n",
+      "   ------------------------- -------------- 3.6/5.7 MB 20.8 MB/s eta 0:00:01\n",
+      "   ------------------------------ --------- 4.4/5.7 MB 20.1 MB/s eta 0:00:01\n",
+      "   ------------------------------------ --- 5.3/5.7 MB 19.8 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  5.7/5.7 MB 20.4 MB/s eta 0:00:01\n",
+      "   ---------------------------------------- 5.7/5.7 MB 18.4 MB/s eta 0:00:00\n",
+      "Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)\n",
+      "   ---------------------------------------- 0.0/536.2 kB ? eta -:--:--\n",
+      "   --------------------------------------  532.5/536.2 kB 32.6 MB/s eta 0:00:01\n",
+      "   ---------------------------------------- 536.2/536.2 kB 8.3 MB/s eta 0:00:00\n",
+      "Installing collected packages: mpmath, xxhash, sympy, shtab, safetensors, pyarrow-hotfix, pyarrow, docstring-parser, dill, torch, multiprocess, tyro, tokenizers, accelerate, transformers, datasets, trl, peft\n",
+      "Successfully installed accelerate-0.27.2 datasets-2.18.0 dill-0.3.8 docstring-parser-0.15 mpmath-1.3.0 multiprocess-0.70.16 peft-0.9.0 pyarrow-15.0.0 pyarrow-hotfix-0.6 safetensors-0.4.2 shtab-1.7.0 sympy-1.12 tokenizers-0.15.2 torch-2.2.1 transformers-4.38.2 trl-0.7.11 tyro-0.7.3 xxhash-3.4.1\n",
+      "Looking in indexes: https://pypi.org/simple/\n",
+      "Collecting bitsandbytes\n",
+      "  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)\n",
+      "Collecting scipy (from bitsandbytes)\n",
+      "  Downloading scipy-1.12.0-cp311-cp311-win_amd64.whl.metadata (60 kB)\n",
+      "     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--\n",
+      "     ------------------------- ------------ 41.0/60.4 kB 991.0 kB/s eta 0:00:01\n",
+      "     -------------------------------------- 60.4/60.4 kB 643.6 kB/s eta 0:00:00\n",
+      "Requirement already satisfied: numpy<1.29.0,>=1.22.4 in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from scipy->bitsandbytes) (1.26.3)\n",
+      "Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)\n",
+      "   ---------------------------------------- 0.0/105.0 MB ? eta -:--:--\n",
+      "   ---------------------------------------- 0.4/105.0 MB 8.1 MB/s eta 0:00:13\n",
+      "   ---------------------------------------- 1.2/105.0 MB 12.9 MB/s eta 0:00:09\n",
+      "    --------------------------------------- 2.3/105.0 MB 16.6 MB/s eta 0:00:07\n",
+      "   - -------------------------------------- 3.5/105.0 MB 18.5 MB/s eta 0:00:06\n",
+      "   - -------------------------------------- 4.8/105.0 MB 20.3 MB/s eta 0:00:05\n",
+      "   -- ------------------------------------- 6.0/105.0 MB 21.2 MB/s eta 0:00:05\n",
+      "   -- ------------------------------------- 7.2/105.0 MB 22.0 MB/s eta 0:00:05\n",
+      "   --- ------------------------------------ 8.5/105.0 MB 22.7 MB/s eta 0:00:05\n",
+      "   --- ------------------------------------ 9.7/105.0 MB 23.0 MB/s eta 0:00:05\n",
+      "   ---- ----------------------------------- 10.8/105.0 MB 25.1 MB/s eta 0:00:04\n",
+      "   ---- ----------------------------------- 11.1/105.0 MB 24.3 MB/s eta 0:00:04\n",
+      "   ---- ----------------------------------- 12.1/105.0 MB 23.4 MB/s eta 0:00:04\n",
+      "   ---- ----------------------------------- 13.1/105.0 MB 24.2 MB/s eta 0:00:04\n",
+      "   ----- ---------------------------------- 14.1/105.0 MB 23.4 MB/s eta 0:00:04\n",
+      "   ----- ---------------------------------- 15.2/105.0 MB 23.4 MB/s eta 0:00:04\n",
+      "   ------ --------------------------------- 16.6/105.0 MB 23.4 MB/s eta 0:00:04\n",
+      "   ------ --------------------------------- 17.7/105.0 MB 22.6 MB/s eta 0:00:04\n",
+      "   ------- -------------------------------- 18.8/105.0 MB 23.4 MB/s eta 0:00:04\n",
+      "   ------- -------------------------------- 19.9/105.0 MB 23.4 MB/s eta 0:00:04\n",
+      "   ------- -------------------------------- 21.0/105.0 MB 24.3 MB/s eta 0:00:04\n",
+      "   -------- ------------------------------- 22.1/105.0 MB 26.2 MB/s eta 0:00:04\n",
+      "   -------- ------------------------------- 23.2/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   --------- ------------------------------ 24.2/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   --------- ------------------------------ 25.3/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ---------- ----------------------------- 26.4/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ---------- ----------------------------- 27.5/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ---------- ----------------------------- 28.6/105.0 MB 28.4 MB/s eta 0:00:03\n",
+      "   ----------- ---------------------------- 29.8/105.0 MB 26.2 MB/s eta 0:00:03\n",
+      "   ----------- ---------------------------- 30.9/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------ --------------------------- 31.9/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------ --------------------------- 33.4/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------- -------------------------- 34.5/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------- -------------------------- 35.6/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------- -------------------------- 36.7/105.0 MB 28.4 MB/s eta 0:00:03\n",
+      "   -------------- ------------------------- 37.8/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   -------------- ------------------------- 38.9/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   --------------- ------------------------ 40.0/105.0 MB 28.4 MB/s eta 0:00:03\n",
+      "   --------------- ------------------------ 41.1/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ---------------- ----------------------- 42.3/105.0 MB 28.4 MB/s eta 0:00:03\n",
+      "   ---------------- ----------------------- 43.1/105.0 MB 26.2 MB/s eta 0:00:03\n",
+      "   ---------------- ----------------------- 44.3/105.0 MB 26.2 MB/s eta 0:00:03\n",
+      "   ----------------- ---------------------- 45.2/105.0 MB 26.2 MB/s eta 0:00:03\n",
+      "   ----------------- ---------------------- 46.4/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------------ --------------------- 47.5/105.0 MB 27.3 MB/s eta 0:00:03\n",
+      "   ------------------ --------------------- 48.8/105.0 MB 26.2 MB/s eta 0:00:03\n",
+      "   ------------------ --------------------- 49.8/105.0 MB 26.2 MB/s eta 0:00:03\n",
+      "   ------------------- -------------------- 50.8/105.0 MB 26.2 MB/s eta 0:00:03\n",
+      "   ------------------- -------------------- 51.6/105.0 MB 25.1 MB/s eta 0:00:03\n",
+      "   -------------------- ------------------- 52.9/105.0 MB 25.2 MB/s eta 0:00:03\n",
+      "   -------------------- ------------------- 54.1/105.0 MB 25.2 MB/s eta 0:00:03\n",
+      "   --------------------- ------------------ 55.4/105.0 MB 25.2 MB/s eta 0:00:02\n",
+      "   --------------------- ------------------ 56.6/105.0 MB 25.2 MB/s eta 0:00:02\n",
+      "   ---------------------- ----------------- 57.9/105.0 MB 25.1 MB/s eta 0:00:02\n",
+      "   ---------------------- ----------------- 59.2/105.0 MB 25.1 MB/s eta 0:00:02\n",
+      "   ----------------------- ---------------- 60.4/105.0 MB 26.2 MB/s eta 0:00:02\n",
+      "   ----------------------- ---------------- 61.7/105.0 MB 26.2 MB/s eta 0:00:02\n",
+      "   ----------------------- ---------------- 62.9/105.0 MB 27.3 MB/s eta 0:00:02\n",
+      "   ------------------------ --------------- 64.2/105.0 MB 27.3 MB/s eta 0:00:02\n",
+      "   ------------------------ --------------- 65.5/105.0 MB 27.3 MB/s eta 0:00:02\n",
+      "   ------------------------- -------------- 66.5/105.0 MB 27.3 MB/s eta 0:00:02\n",
+      "   ------------------------- -------------- 67.7/105.0 MB 28.5 MB/s eta 0:00:02\n",
+      "   -------------------------- ------------- 68.8/105.0 MB 28.5 MB/s eta 0:00:02\n",
+      "   -------------------------- ------------- 69.9/105.0 MB 27.3 MB/s eta 0:00:02\n",
+      "   --------------------------- ------------ 70.9/105.0 MB 28.5 MB/s eta 0:00:02\n",
+      "   --------------------------- ------------ 71.9/105.0 MB 27.3 MB/s eta 0:00:02\n",
+      "   --------------------------- ------------ 72.9/105.0 MB 26.2 MB/s eta 0:00:02\n",
+      "   ---------------------------- ----------- 73.7/105.0 MB 26.2 MB/s eta 0:00:02\n",
+      "   ---------------------------- ----------- 74.0/105.0 MB 26.2 MB/s eta 0:00:02\n",
+      "   ---------------------------- ----------- 75.1/105.0 MB 24.2 MB/s eta 0:00:02\n",
+      "   ----------------------------- ---------- 76.2/105.0 MB 24.3 MB/s eta 0:00:02\n",
+      "   ----------------------------- ---------- 77.3/105.0 MB 23.4 MB/s eta 0:00:02\n",
+      "   ----------------------------- ---------- 78.3/105.0 MB 24.2 MB/s eta 0:00:02\n",
+      "   ------------------------------ --------- 78.9/105.0 MB 23.4 MB/s eta 0:00:02\n",
+      "   ------------------------------ --------- 80.3/105.0 MB 22.6 MB/s eta 0:00:02\n",
+      "   ------------------------------ --------- 81.2/105.0 MB 23.4 MB/s eta 0:00:02\n",
+      "   ------------------------------- -------- 81.8/105.0 MB 21.9 MB/s eta 0:00:02\n",
+      "   ------------------------------- -------- 82.5/105.0 MB 21.1 MB/s eta 0:00:02\n",
+      "   ------------------------------- -------- 83.3/105.0 MB 21.8 MB/s eta 0:00:01\n",
+      "   -------------------------------- ------- 84.1/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   -------------------------------- ------- 84.9/105.0 MB 22.6 MB/s eta 0:00:01\n",
+      "   -------------------------------- ------- 85.8/105.0 MB 21.8 MB/s eta 0:00:01\n",
+      "   --------------------------------- ------ 86.6/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------- ------ 87.5/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------- ------ 88.3/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------- ------ 89.1/105.0 MB 19.9 MB/s eta 0:00:01\n",
+      "   ---------------------------------- ----- 89.9/105.0 MB 19.8 MB/s eta 0:00:01\n",
+      "   ---------------------------------- ----- 90.8/105.0 MB 19.8 MB/s eta 0:00:01\n",
+      "   ---------------------------------- ----- 91.6/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   ----------------------------------- ---- 92.3/105.0 MB 19.8 MB/s eta 0:00:01\n",
+      "   ----------------------------------- ---- 93.2/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   ----------------------------------- ---- 94.0/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   ------------------------------------ --- 94.6/105.0 MB 19.8 MB/s eta 0:00:01\n",
+      "   ------------------------------------ --- 95.5/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   ------------------------------------ --- 96.3/105.0 MB 19.8 MB/s eta 0:00:01\n",
+      "   ------------------------------------- -- 97.2/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   ------------------------------------- -- 98.0/105.0 MB 19.8 MB/s eta 0:00:01\n",
+      "   ------------------------------------- -- 98.9/105.0 MB 19.8 MB/s eta 0:00:01\n",
+      "   -------------------------------------- - 99.8/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   ------------------------------------- - 100.6/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   ------------------------------------- - 101.6/105.0 MB 19.8 MB/s eta 0:00:01\n",
+      "   --------------------------------------  102.4/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------------  103.3/105.0 MB 20.5 MB/s eta 0:00:01\n",
+      "   --------------------------------------  104.1/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------------  105.0/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------------  105.0/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------------  105.0/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------------  105.0/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------------  105.0/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------------  105.0/105.0 MB 21.1 MB/s eta 0:00:01\n",
+      "   --------------------------------------- 105.0/105.0 MB 14.5 MB/s eta 0:00:00\n",
+      "Downloading scipy-1.12.0-cp311-cp311-win_amd64.whl (46.2 MB)\n",
+      "   ---------------------------------------- 0.0/46.2 MB ? eta -:--:--\n",
+      "   - -------------------------------------- 1.2/46.2 MB 25.1 MB/s eta 0:00:02\n",
+      "   - -------------------------------------- 2.1/46.2 MB 26.4 MB/s eta 0:00:02\n",
+      "   -- ------------------------------------- 2.9/46.2 MB 23.3 MB/s eta 0:00:02\n",
+      "   --- ------------------------------------ 3.8/46.2 MB 24.4 MB/s eta 0:00:02\n",
+      "   ---- ----------------------------------- 4.7/46.2 MB 23.3 MB/s eta 0:00:02\n",
+      "   ---- ----------------------------------- 5.6/46.2 MB 22.4 MB/s eta 0:00:02\n",
+      "   ----- ---------------------------------- 6.5/46.2 MB 22.0 MB/s eta 0:00:02\n",
+      "   ------ --------------------------------- 7.4/46.2 MB 22.6 MB/s eta 0:00:02\n",
+      "   ------- -------------------------------- 8.3/46.2 MB 23.2 MB/s eta 0:00:02\n",
+      "   ------- -------------------------------- 9.1/46.2 MB 22.4 MB/s eta 0:00:02\n",
+      "   -------- ------------------------------- 10.2/46.2 MB 22.4 MB/s eta 0:00:02\n",
+      "   --------- ------------------------------ 10.9/46.2 MB 21.8 MB/s eta 0:00:02\n",
+      "   ---------- ----------------------------- 11.7/46.2 MB 21.8 MB/s eta 0:00:02\n",
+      "   ---------- ----------------------------- 12.6/46.2 MB 21.1 MB/s eta 0:00:02\n",
+      "   ----------- ---------------------------- 13.3/46.2 MB 21.8 MB/s eta 0:00:02\n",
+      "   ------------ --------------------------- 14.2/46.2 MB 21.1 MB/s eta 0:00:02\n",
+      "   ------------ --------------------------- 14.8/46.2 MB 21.1 MB/s eta 0:00:02\n",
+      "   ------------- -------------------------- 15.5/46.2 MB 20.5 MB/s eta 0:00:02\n",
+      "   ------------- -------------------------- 16.1/46.2 MB 19.8 MB/s eta 0:00:02\n",
+      "   -------------- ------------------------- 16.8/46.2 MB 19.9 MB/s eta 0:00:02\n",
+      "   --------------- ------------------------ 17.5/46.2 MB 19.3 MB/s eta 0:00:02\n",
+      "   --------------- ------------------------ 18.1/46.2 MB 18.2 MB/s eta 0:00:02\n",
+      "   --------------- ------------------------ 18.5/46.2 MB 18.2 MB/s eta 0:00:02\n",
+      "   ---------------- ----------------------- 19.5/46.2 MB 18.2 MB/s eta 0:00:02\n",
+      "   ----------------- ---------------------- 20.2/46.2 MB 18.2 MB/s eta 0:00:02\n",
+      "   ------------------ --------------------- 20.9/46.2 MB 17.7 MB/s eta 0:00:02\n",
+      "   ------------------ --------------------- 21.5/46.2 MB 17.2 MB/s eta 0:00:02\n",
+      "   ------------------- -------------------- 22.1/46.2 MB 16.8 MB/s eta 0:00:02\n",
+      "   ------------------- -------------------- 22.8/46.2 MB 16.4 MB/s eta 0:00:02\n",
+      "   -------------------- ------------------- 23.4/46.2 MB 16.4 MB/s eta 0:00:02\n",
+      "   -------------------- ------------------- 24.2/46.2 MB 16.4 MB/s eta 0:00:02\n",
+      "   --------------------- ------------------ 24.8/46.2 MB 16.0 MB/s eta 0:00:02\n",
+      "   ---------------------- ----------------- 25.5/46.2 MB 16.0 MB/s eta 0:00:02\n",
+      "   ---------------------- ----------------- 26.2/46.2 MB 16.8 MB/s eta 0:00:02\n",
+      "   ----------------------- ---------------- 26.9/46.2 MB 16.4 MB/s eta 0:00:02\n",
+      "   ----------------------- ---------------- 27.6/46.2 MB 16.8 MB/s eta 0:00:02\n",
+      "   ------------------------ --------------- 28.3/46.2 MB 16.8 MB/s eta 0:00:02\n",
+      "   ------------------------- -------------- 29.1/46.2 MB 16.8 MB/s eta 0:00:02\n",
+      "   ------------------------- -------------- 29.8/46.2 MB 16.8 MB/s eta 0:00:01\n",
+      "   -------------------------- ------------- 30.5/46.2 MB 16.8 MB/s eta 0:00:01\n",
+      "   --------------------------- ------------ 31.3/46.2 MB 17.2 MB/s eta 0:00:01\n",
+      "   --------------------------- ------------ 32.0/46.2 MB 17.3 MB/s eta 0:00:01\n",
+      "   ---------------------------- ----------- 32.7/46.2 MB 17.3 MB/s eta 0:00:01\n",
+      "   ----------------------------- ---------- 33.5/46.2 MB 17.7 MB/s eta 0:00:01\n",
+      "   ----------------------------- ---------- 34.2/46.2 MB 17.3 MB/s eta 0:00:01\n",
+      "   ------------------------------ --------- 34.9/46.2 MB 17.7 MB/s eta 0:00:01\n",
+      "   ------------------------------ --------- 35.7/46.2 MB 17.7 MB/s eta 0:00:01\n",
+      "   ------------------------------- -------- 36.4/46.2 MB 17.7 MB/s eta 0:00:01\n",
+      "   -------------------------------- ------- 37.2/46.2 MB 18.2 MB/s eta 0:00:01\n",
+      "   -------------------------------- ------- 38.0/46.2 MB 18.2 MB/s eta 0:00:01\n",
+      "   --------------------------------- ------ 38.7/46.2 MB 18.2 MB/s eta 0:00:01\n",
+      "   ---------------------------------- ----- 39.4/46.2 MB 18.2 MB/s eta 0:00:01\n",
+      "   ---------------------------------- ----- 40.2/46.2 MB 18.2 MB/s eta 0:00:01\n",
+      "   ----------------------------------- ---- 41.0/46.2 MB 18.7 MB/s eta 0:00:01\n",
+      "   ------------------------------------ --- 41.8/46.2 MB 18.2 MB/s eta 0:00:01\n",
+      "   ------------------------------------ --- 42.6/46.2 MB 18.2 MB/s eta 0:00:01\n",
+      "   ------------------------------------- -- 43.4/46.2 MB 19.3 MB/s eta 0:00:01\n",
+      "   -------------------------------------- - 44.2/46.2 MB 18.7 MB/s eta 0:00:01\n",
+      "   -------------------------------------- - 44.9/46.2 MB 19.3 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  45.7/46.2 MB 18.7 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  46.2/46.2 MB 18.7 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  46.2/46.2 MB 18.7 MB/s eta 0:00:01\n",
+      "   ---------------------------------------  46.2/46.2 MB 18.7 MB/s eta 0:00:01\n",
+      "   ---------------------------------------- 46.2/46.2 MB 15.9 MB/s eta 0:00:00\n",
+      "Installing collected packages: scipy, bitsandbytes\n",
+      "Successfully installed bitsandbytes-0.42.0 scipy-1.12.0\n",
+      "Requirement already satisfied: bottleneck in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (1.3.5)\n",
+      "Collecting bottleneck\n",
+      "  Downloading Bottleneck-1.3.8-cp311-cp311-win_amd64.whl.metadata (8.1 kB)\n",
+      "Requirement already satisfied: numpy in c:\\users\\boulanger\\appdata\\local\\miniconda3\\lib\\site-packages (from bottleneck) (1.26.3)\n",
+      "Downloading Bottleneck-1.3.8-cp311-cp311-win_amd64.whl (110 kB)\n",
+      "   ---------------------------------------- 0.0/110.1 kB ? eta -:--:--\n",
+      "   ---------- ---------------------------- 30.7/110.1 kB 660.6 kB/s eta 0:00:01\n",
+      "   ------------------------------------- -- 102.4/110.1 kB 1.5 MB/s eta 0:00:01\n",
+      "   ------------------------------------- -- 102.4/110.1 kB 1.5 MB/s eta 0:00:01\n",
+      "   -------------------------------------- 110.1/110.1 kB 799.1 kB/s eta 0:00:00\n",
+      "Installing collected packages: bottleneck\n",
+      "  Attempting uninstall: bottleneck\n",
+      "    Found existing installation: Bottleneck 1.3.5\n",
+      "    Uninstalling Bottleneck-1.3.5:\n",
+      "      Successfully uninstalled Bottleneck-1.3.5\n",
+      "Successfully installed bottleneck-1.3.8\n"
+     ]
+    }
+   ],
    "source": [
     "!pip install transformers datasets peft python-dotenv accelerate trl\n",
     "!pip install -i https://pypi.org/simple/ bitsandbytes\n",
@@ -73,25 +666,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "6b625baf-d995-43b8-b8ce-f95ec624c73a",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "ename": "KeyError",
-     "evalue": "'HF_TOKEN'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[2], line 13\u001b[0m\n\u001b[1;32m      6\u001b[0m model_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgoogle/gemma-2b\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m      7\u001b[0m bnb_config \u001b[38;5;241m=\u001b[39m BitsAndBytesConfig(\n\u001b[1;32m      8\u001b[0m     load_in_4bit\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m      9\u001b[0m     bnb_4bit_quant_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnf4\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m     10\u001b[0m     bnb_4bit_compute_dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mbfloat16\n\u001b[1;32m     11\u001b[0m )\n\u001b[0;32m---> 13\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_id, token\u001b[38;5;241m=\u001b[39m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menviron\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mHF_TOKEN\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m, padding_side\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mright\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m     14\u001b[0m model \u001b[38;5;241m=\u001b[39m AutoModelForCausalLM\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_id, quantization_config\u001b[38;5;241m=\u001b[39mbnb_config, device_map\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m:\u001b[38;5;241m0\u001b[39m}, token\u001b[38;5;241m=\u001b[39mos\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mHF_TOKEN\u001b[39m\u001b[38;5;124m'\u001b[39m])\n",
-      "File \u001b[0;32m/mpcdf/soft/SLE_15/packages/x86_64/anaconda/3/2023.03/lib/python3.10/os.py:680\u001b[0m, in \u001b[0;36m_Environ.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m    677\u001b[0m     value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencodekey(key)]\n\u001b[1;32m    678\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n\u001b[1;32m    679\u001b[0m     \u001b[38;5;66;03m# raise KeyError with the original key value\u001b[39;00m\n\u001b[0;32m--> 680\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m    681\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecodevalue(value)\n",
-      "\u001b[0;31mKeyError\u001b[0m: 'HF_TOKEN'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import torch\n",
     "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
@@ -162,26 +742,26 @@
      "evalue": "CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 23.67 GiB of which 38.19 MiB is free. Process 36053 has 11.64 GiB memory in use. Process 31527 has 7.32 GiB memory in use. Including non-PyTorch memory, this process has 4.63 GiB memory in use. Of the allocated memory 4.34 GiB is allocated by PyTorch, and 9.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
      "output_type": "error",
      "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[4], line 33\u001b[0m\n\u001b[1;32m     13\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m [text]\n\u001b[1;32m     15\u001b[0m trainer \u001b[38;5;241m=\u001b[39m SFTTrainer(\n\u001b[1;32m     16\u001b[0m     model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m     17\u001b[0m     train_dataset\u001b[38;5;241m=\u001b[39mdata[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     31\u001b[0m     formatting_func\u001b[38;5;241m=\u001b[39mformatting_func,\n\u001b[1;32m     32\u001b[0m )\n\u001b[0;32m---> 33\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:331\u001b[0m, in \u001b[0;36mSFTTrainer.train\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    328\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mneftune_noise_alpha \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trainer_supports_neftune:\n\u001b[1;32m    329\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trl_activate_neftune(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel)\n\u001b[0;32m--> 331\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    333\u001b[0m \u001b[38;5;66;03m# After training we make sure to retrieve back the original forward pass method\u001b[39;00m\n\u001b[1;32m    334\u001b[0m \u001b[38;5;66;03m# for the embedding layer by removing the forward post hook.\u001b[39;00m\n\u001b[1;32m    335\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mneftune_noise_alpha \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trainer_supports_neftune:\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:1624\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1622\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   1623\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1624\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1625\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1626\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1627\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1628\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1629\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:1961\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   1958\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m   1960\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 1961\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1963\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   1964\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m   1965\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[1;32m   1966\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m   1967\u001b[0m ):\n\u001b[1;32m   1968\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m   1969\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2902\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m   2899\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   2901\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 2902\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2904\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m   2905\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean()  \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2925\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m   2923\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   2924\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 2925\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2926\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   2927\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   2928\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1518\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1519\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1523\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/accelerate/utils/operations.py:817\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32.<locals>.forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    816\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 817\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/accelerate/utils/operations.py:805\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    804\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 805\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/amp/autocast_mode.py:16\u001b[0m, in \u001b[0;36mautocast_decorator.<locals>.decorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m     15\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 16\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/peft/peft_model.py:1091\u001b[0m, in \u001b[0;36mPeftModelForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)\u001b[0m\n\u001b[1;32m   1089\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m peft_config\u001b[38;5;241m.\u001b[39mpeft_type \u001b[38;5;241m==\u001b[39m PeftType\u001b[38;5;241m.\u001b[39mPOLY:\n\u001b[1;32m   1090\u001b[0m         kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtask_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m task_ids\n\u001b[0;32m-> 1091\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbase_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1092\u001b[0m \u001b[43m        \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1093\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1094\u001b[0m \u001b[43m        \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1095\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlabels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1096\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1097\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1098\u001b[0m \u001b[43m        \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1099\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1100\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1102\u001b[0m batch_size \u001b[38;5;241m=\u001b[39m _get_batch_size(input_ids, inputs_embeds)\n\u001b[1;32m   1103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attention_mask \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1104\u001b[0m     \u001b[38;5;66;03m# concat prompt attention mask\u001b[39;00m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1518\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1519\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1523\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:160\u001b[0m, in \u001b[0;36mBaseTuner.forward\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    159\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any):\n\u001b[0;32m--> 160\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/accelerate/hooks.py:166\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(module, *args, **kwargs)\u001b[0m\n\u001b[1;32m    164\u001b[0m         output \u001b[38;5;241m=\u001b[39m module\u001b[38;5;241m.\u001b[39m_old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    165\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 166\u001b[0m     output \u001b[38;5;241m=\u001b[39m \u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_old_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    167\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
-      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py:1088\u001b[0m, in \u001b[0;36mGemmaForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)\u001b[0m\n\u001b[1;32m   1086\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m   1087\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(hidden_states)\n\u001b[0;32m-> 1088\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[43mlogits\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1089\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1090\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m labels \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1091\u001b[0m     \u001b[38;5;66;03m# Shift so that tokens < n predict n\u001b[39;00m\n",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 23.67 GiB of which 38.19 MiB is free. Process 36053 has 11.64 GiB memory in use. Process 31527 has 7.32 GiB memory in use. Including non-PyTorch memory, this process has 4.63 GiB memory in use. Of the allocated memory 4.34 GiB is allocated by PyTorch, and 9.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mOutOfMemoryError\u001B[0m                          Traceback (most recent call last)",
+      "Cell \u001B[0;32mIn[4], line 33\u001B[0m\n\u001B[1;32m     13\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m [text]\n\u001B[1;32m     15\u001B[0m trainer \u001B[38;5;241m=\u001B[39m SFTTrainer(\n\u001B[1;32m     16\u001B[0m     model\u001B[38;5;241m=\u001B[39mmodel,\n\u001B[1;32m     17\u001B[0m     train_dataset\u001B[38;5;241m=\u001B[39mdata[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtrain\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[0;32m   (...)\u001B[0m\n\u001B[1;32m     31\u001B[0m     formatting_func\u001B[38;5;241m=\u001B[39mformatting_func,\n\u001B[1;32m     32\u001B[0m )\n\u001B[0;32m---> 33\u001B[0m \u001B[43mtrainer\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtrain\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:331\u001B[0m, in \u001B[0;36mSFTTrainer.train\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m    328\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mneftune_noise_alpha \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_trainer_supports_neftune:\n\u001B[1;32m    329\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_trl_activate_neftune(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel)\n\u001B[0;32m--> 331\u001B[0m output \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtrain\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    333\u001B[0m \u001B[38;5;66;03m# After training we make sure to retrieve back the original forward pass method\u001B[39;00m\n\u001B[1;32m    334\u001B[0m \u001B[38;5;66;03m# for the embedding layer by removing the forward post hook.\u001B[39;00m\n\u001B[1;32m    335\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mneftune_noise_alpha \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_trainer_supports_neftune:\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:1624\u001B[0m, in \u001B[0;36mTrainer.train\u001B[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001B[0m\n\u001B[1;32m   1622\u001B[0m         hf_hub_utils\u001B[38;5;241m.\u001B[39menable_progress_bars()\n\u001B[1;32m   1623\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 1624\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43minner_training_loop\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m   1625\u001B[0m \u001B[43m        \u001B[49m\u001B[43margs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1626\u001B[0m \u001B[43m        \u001B[49m\u001B[43mresume_from_checkpoint\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mresume_from_checkpoint\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1627\u001B[0m \u001B[43m        \u001B[49m\u001B[43mtrial\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtrial\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1628\u001B[0m \u001B[43m        \u001B[49m\u001B[43mignore_keys_for_eval\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mignore_keys_for_eval\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1629\u001B[0m \u001B[43m    \u001B[49m\u001B[43m)\u001B[49m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:1961\u001B[0m, in \u001B[0;36mTrainer._inner_training_loop\u001B[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001B[0m\n\u001B[1;32m   1958\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcontrol \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcallback_handler\u001B[38;5;241m.\u001B[39mon_step_begin(args, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstate, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcontrol)\n\u001B[1;32m   1960\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39maccelerator\u001B[38;5;241m.\u001B[39maccumulate(model):\n\u001B[0;32m-> 1961\u001B[0m     tr_loss_step \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtraining_step\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1963\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m   1964\u001B[0m     args\u001B[38;5;241m.\u001B[39mlogging_nan_inf_filter\n\u001B[1;32m   1965\u001B[0m     \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_torch_tpu_available()\n\u001B[1;32m   1966\u001B[0m     \u001B[38;5;129;01mand\u001B[39;00m (torch\u001B[38;5;241m.\u001B[39misnan(tr_loss_step) \u001B[38;5;129;01mor\u001B[39;00m torch\u001B[38;5;241m.\u001B[39misinf(tr_loss_step))\n\u001B[1;32m   1967\u001B[0m ):\n\u001B[1;32m   1968\u001B[0m     \u001B[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001B[39;00m\n\u001B[1;32m   1969\u001B[0m     tr_loss \u001B[38;5;241m+\u001B[39m\u001B[38;5;241m=\u001B[39m tr_loss \u001B[38;5;241m/\u001B[39m (\u001B[38;5;241m1\u001B[39m \u001B[38;5;241m+\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstate\u001B[38;5;241m.\u001B[39mglobal_step \u001B[38;5;241m-\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_globalstep_last_logged)\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2902\u001B[0m, in \u001B[0;36mTrainer.training_step\u001B[0;34m(self, model, inputs)\u001B[0m\n\u001B[1;32m   2899\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m loss_mb\u001B[38;5;241m.\u001B[39mreduce_mean()\u001B[38;5;241m.\u001B[39mdetach()\u001B[38;5;241m.\u001B[39mto(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mdevice)\n\u001B[1;32m   2901\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcompute_loss_context_manager():\n\u001B[0;32m-> 2902\u001B[0m     loss \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcompute_loss\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   2904\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mn_gpu \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[1;32m   2905\u001B[0m     loss \u001B[38;5;241m=\u001B[39m loss\u001B[38;5;241m.\u001B[39mmean()  \u001B[38;5;66;03m# mean() to average on multi-gpu parallel training\u001B[39;00m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2925\u001B[0m, in \u001B[0;36mTrainer.compute_loss\u001B[0;34m(self, model, inputs, return_outputs)\u001B[0m\n\u001B[1;32m   2923\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m   2924\u001B[0m     labels \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m-> 2925\u001B[0m outputs \u001B[38;5;241m=\u001B[39m \u001B[43mmodel\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43minputs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   2926\u001B[0m \u001B[38;5;66;03m# Save past state if it exists\u001B[39;00m\n\u001B[1;32m   2927\u001B[0m \u001B[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001B[39;00m\n\u001B[1;32m   2928\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mpast_index \u001B[38;5;241m>\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m0\u001B[39m:\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001B[0m, in \u001B[0;36mModule._wrapped_call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m   1509\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_compiled_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)  \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[1;32m   1510\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 1511\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_call_impl\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m   1515\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m   1516\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m   1517\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks\n\u001B[1;32m   1518\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m   1519\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1520\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1522\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m   1523\u001B[0m     result \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/accelerate/utils/operations.py:817\u001B[0m, in \u001B[0;36mconvert_outputs_to_fp32.<locals>.forward\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m    816\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mforward\u001B[39m(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs):\n\u001B[0;32m--> 817\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mmodel_forward\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/accelerate/utils/operations.py:805\u001B[0m, in \u001B[0;36mConvertOutputsToFp32.__call__\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m    804\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__call__\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs):\n\u001B[0;32m--> 805\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m convert_to_fp32(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmodel_forward\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m)\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/torch/amp/autocast_mode.py:16\u001B[0m, in \u001B[0;36mautocast_decorator.<locals>.decorate_autocast\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m     13\u001B[0m \u001B[38;5;129m@functools\u001B[39m\u001B[38;5;241m.\u001B[39mwraps(func)\n\u001B[1;32m     14\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecorate_autocast\u001B[39m(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs):\n\u001B[1;32m     15\u001B[0m     \u001B[38;5;28;01mwith\u001B[39;00m autocast_instance:\n\u001B[0;32m---> 16\u001B[0m         \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfunc\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/peft/peft_model.py:1091\u001B[0m, in \u001B[0;36mPeftModelForCausalLM.forward\u001B[0;34m(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)\u001B[0m\n\u001B[1;32m   1089\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m peft_config\u001B[38;5;241m.\u001B[39mpeft_type \u001B[38;5;241m==\u001B[39m PeftType\u001B[38;5;241m.\u001B[39mPOLY:\n\u001B[1;32m   1090\u001B[0m         kwargs[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtask_ids\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m task_ids\n\u001B[0;32m-> 1091\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbase_model\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m   1092\u001B[0m \u001B[43m        \u001B[49m\u001B[43minput_ids\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43minput_ids\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1093\u001B[0m \u001B[43m        \u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mattention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1094\u001B[0m \u001B[43m        \u001B[49m\u001B[43minputs_embeds\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43minputs_embeds\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1095\u001B[0m \u001B[43m        \u001B[49m\u001B[43mlabels\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mlabels\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1096\u001B[0m \u001B[43m        \u001B[49m\u001B[43moutput_attentions\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_attentions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1097\u001B[0m \u001B[43m        \u001B[49m\u001B[43moutput_hidden_states\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_hidden_states\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1098\u001B[0m \u001B[43m        \u001B[49m\u001B[43mreturn_dict\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mreturn_dict\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1099\u001B[0m \u001B[43m        \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1100\u001B[0m \u001B[43m    \u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1102\u001B[0m batch_size \u001B[38;5;241m=\u001B[39m _get_batch_size(input_ids, inputs_embeds)\n\u001B[1;32m   1103\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m attention_mask \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m   1104\u001B[0m     \u001B[38;5;66;03m# concat prompt attention mask\u001B[39;00m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001B[0m, in \u001B[0;36mModule._wrapped_call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m   1509\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_compiled_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)  \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[1;32m   1510\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 1511\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_call_impl\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m   1515\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m   1516\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m   1517\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks\n\u001B[1;32m   1518\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m   1519\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1520\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1522\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m   1523\u001B[0m     result \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:160\u001B[0m, in \u001B[0;36mBaseTuner.forward\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m    159\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mforward\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs: Any, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs: Any):\n\u001B[0;32m--> 160\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmodel\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mforward\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/accelerate/hooks.py:166\u001B[0m, in \u001B[0;36madd_hook_to_module.<locals>.new_forward\u001B[0;34m(module, *args, **kwargs)\u001B[0m\n\u001B[1;32m    164\u001B[0m         output \u001B[38;5;241m=\u001B[39m module\u001B[38;5;241m.\u001B[39m_old_forward(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m    165\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m--> 166\u001B[0m     output \u001B[38;5;241m=\u001B[39m \u001B[43mmodule\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_old_forward\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    167\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m module\u001B[38;5;241m.\u001B[39m_hf_hook\u001B[38;5;241m.\u001B[39mpost_forward(module, output)\n",
+      "File \u001B[0;32m~/.local/lib/python3.10/site-packages/transformers/models/gemma/modeling_gemma.py:1088\u001B[0m, in \u001B[0;36mGemmaForCausalLM.forward\u001B[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)\u001B[0m\n\u001B[1;32m   1086\u001B[0m hidden_states \u001B[38;5;241m=\u001B[39m outputs[\u001B[38;5;241m0\u001B[39m]\n\u001B[1;32m   1087\u001B[0m logits \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlm_head(hidden_states)\n\u001B[0;32m-> 1088\u001B[0m logits \u001B[38;5;241m=\u001B[39m \u001B[43mlogits\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfloat\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1089\u001B[0m loss \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m   1090\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m labels \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m   1091\u001B[0m     \u001B[38;5;66;03m# Shift so that tokens < n predict n\u001B[39;00m\n",
+      "\u001B[0;31mOutOfMemoryError\u001B[0m: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 23.67 GiB of which 38.19 MiB is free. Process 36053 has 11.64 GiB memory in use. Process 31527 has 7.32 GiB memory in use. Including non-PyTorch memory, this process has 4.63 GiB memory in use. Of the allocated memory 4.34 GiB is allocated by PyTorch, and 9.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
      ]
     }
    ],
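The retained OutOfMemoryError shows a shared GPU: of the 23.67 GiB on device 0, two other processes already hold roughly 19 GiB, leaving only a few GiB for this run, so the cleanest fix is an exclusive GPU. Within this process, the levers are the allocator setting the error text itself suggests plus the usual memory-saving trainer options. A sketch with illustrative values (the notebook's actual TrainingArguments are not visible in this hunk):

    import os

    # Suggested by the error message; must be set before CUDA is first
    # initialized, i.e. before the first torch.cuda allocation in this process.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    from transformers import TrainingArguments

    # Illustrative memory-saving settings, not the notebook's originals.
    args = TrainingArguments(
        output_dir="outputs",
        per_device_train_batch_size=1,   # smallest per-step footprint
        gradient_accumulation_steps=8,   # keep the effective batch size
        gradient_checkpointing=True,     # trade compute for activation memory
        optim="paged_adamw_8bit",        # bitsandbytes paged optimizer; pages state out under pressure
        bf16=True,
    )

With 4-bit quantization plus LoRA already in place, batch size and sequence length are usually the remaining drivers of activation memory.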
@@ -232,9 +812,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Anaconda 2023.03",
+   "name": "python3",
    "language": "python",
-   "name": "anaconda_3_2023_03"
+   "display_name": "Python 3 (ipykernel)"
   },
   "language_info": {
    "codemirror_mode": {
-- 
GitLab