import pandas as pd


def find_row(last_dates, start_date):
    """Return the index of the first timeline row that is free at *start_date*.

    Args:
        last_dates: For every row opened so far, the end date of the bar
            most recently placed in it.
        start_date: Start date of the bar to place. It must be comparable
            with the entries of *last_dates*.

    Returns:
        The index of the first row whose last end date lies strictly before
        *start_date*, or ``len(last_dates)`` when no existing row is free
        (the caller then opens a new row).
    """
    return next(
        (i for i, last_date in enumerate(last_dates) if start_date > last_date),
        len(last_dates),
    )


def assign_rows(frame, start_col="dateOfBirth", end_col="dateOfDeath"):
    """Return a copy of *frame* with a ``row`` column of non-overlapping lanes.

    The date columns are parsed with ``pd.to_datetime(errors='coerce')``.
    Records lacking either date are left out: comparing a missing value
    (float NaN) with a date string is what raised
    ``TypeError: '>' not supported between instances of 'float' and 'str'``.
    The remaining records are processed in order of their start date and
    each one is put into the first row that is already free.

    Args:
        frame: DataFrame holding at least *start_col* and *end_col*.
        start_col: Name of the column with the start of each bar.
        end_col: Name of the column with the end of each bar.

    Returns:
        A new DataFrame (the input is not modified) sorted by *start_col*,
        with parsed date columns and an integer ``row`` column.
    """
    placed = frame.copy()
    placed[start_col] = pd.to_datetime(placed[start_col], errors="coerce")
    placed[end_col] = pd.to_datetime(placed[end_col], errors="coerce")

    # A bar needs both ends; unparseable or missing dates became NaT above.
    placed = placed.dropna(subset=[start_col, end_col])
    # First-fit placement reuses rows best when bars arrive by start date.
    placed = placed.sort_values(start_col, kind="stable")

    last_dates = []  # end date of the latest bar in each row
    rows = []
    for start, end in zip(placed[start_col], placed[end_col]):
        row = find_row(last_dates, start)
        if row < len(last_dates):
            last_dates[row] = end
        else:
            last_dates.append(end)
        rows.append(row)

    placed["row"] = rows
    return placed


def plot_timeline(frame):
    """Build the life-span timeline figure for a frame produced by ``assign_rows``.

    Args:
        frame: DataFrame with ``dateOfBirth``, ``dateOfDeath``, ``row`` and
            ``fullName`` columns.

    Returns:
        The plotly figure; the caller decides when to show it.
    """
    # Imported here so the layout helpers above stay usable without plotly.
    import plotly.express as px

    fig = px.timeline(
        frame,
        x_start="dateOfBirth",
        x_end="dateOfDeath",
        y="row",
        text="fullName",
        title="Scholars' Life Spans Timeline",
    )
    # The row number is only a layout device, so hide the y-axis ticks.
    fig.update_layout(yaxis=dict(tickmode='array', tickvals=[], ticktext=[]))
    # Put row 0 (earliest births) at the top.
    fig.update_yaxes(autorange="reversed")
    return fig


# Notebook cells execute with __name__ == "__main__", so this runs in Jupyter
# while leaving the helpers importable without touching the file system.
if __name__ == "__main__":
    # `df` keeps every record exactly as loaded; the export cell reuses it.
    df = pd.read_csv("scholars.csv", encoding='utf-8')

    fig = plot_timeline(assign_rows(df))
    fig.show()
# Build the TimelineJS spreadsheet from the DataFrame `df` created by the
# plotting cell. New columns are added to `df` itself, and the final sheet is
# written to "timeline_data.xlsx".

def _join_present(values):
    """Join the non-missing entries of one record with an HTML line break."""
    return '<br>'.join(values.dropna())


# Birth and death year as nullable integers; unparseable dates become <NA>.
birth_year = pd.to_datetime(df['dateOfBirth'], errors='coerce').dt.year
df['Year'] = birth_year.astype('Int64')
death_year = pd.to_datetime(df['dateOfDeath'], errors='coerce').dt.year
df['End Year'] = death_year.astype('Int64')

# "birth - death", with an empty string standing in for a missing year.
birth_label = df['Year'].astype(str).replace('<NA>', '')
death_label = df['End Year'].astype(str).replace('<NA>', '')
df['Display Date'] = birth_label + ' - ' + death_label

# "fullName (birth - death)"
df['Headline'] = df['fullName'] + ' (' + df['Display Date'] + ')'

# Occupation and field of work, one per line in the slide text.
df['Text'] = df[['occupation', 'fieldOfWork']].apply(_join_present, axis=1)

# The 'image' column is used as-is; presumably it already holds usable URLs.
df['Media'] = df['image']

# Every record goes into the same TimelineJS group.
df['Group'] = 'actors'

# Replace remaining missing values in the text columns with empty strings.
for text_col in ('Display Date', 'Headline', 'Text', 'Media'):
    df[text_col] = df[text_col].fillna('')

# Column order of the TimelineJS spreadsheet template.
columns = [
    "Year", "Month", "Day", "Time",
    "End Year", "End Month", "End Day", "End Time",
    "Display Date", "Headline", "Text",
    "Media", "Media Credit", "Media Caption", "Media Thumbnail",
    "Type", "Group", "Background", "Link",
]

# Template columns that have no data yet are added as empty strings.
for name in columns:
    if name in df:
        continue
    df[name] = ''

timeline_df = df[columns]
timeline_df.to_excel("timeline_data.xlsx", index=False)
"metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/wikidata/scholars-de.ipynb b/wikidata/scholars-de.ipynb index 658f3070e5c404c1e6c6d80873090af05c5acca2..02784afc3ecbaf90641180c3eab54c0e7190bbeb 100644 --- a/wikidata/scholars-de.ipynb +++ b/wikidata/scholars-de.ipynb @@ -248,132 +248,6 @@ } }, "id": "2d7bdaeed0f38415" - }, - { - "cell_type": "code", - "execution_count": 13, - "outputs": [ - { - "ename": "TypeError", - "evalue": "'>' not supported between instances of 'float' and 'str'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[13], line 20\u001B[0m\n\u001B[0;32m 18\u001B[0m df[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mrow\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;241m0\u001B[39m\n\u001B[0;32m 19\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m index, scholar \u001B[38;5;129;01min\u001B[39;00m df\u001B[38;5;241m.\u001B[39miterrows():\n\u001B[1;32m---> 20\u001B[0m row \u001B[38;5;241m=\u001B[39m \u001B[43mfind_row\u001B[49m\u001B[43m(\u001B[49m\u001B[43mlast_dates\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mscholar\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mdateOfBirth\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 21\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m row \u001B[38;5;241m<\u001B[39m \u001B[38;5;28mlen\u001B[39m(last_dates):\n\u001B[0;32m 22\u001B[0m 
last_dates[row] \u001B[38;5;241m=\u001B[39m scholar[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mdateOfDeath\u001B[39m\u001B[38;5;124m'\u001B[39m]\n", - "Cell \u001B[1;32mIn[13], line 13\u001B[0m, in \u001B[0;36mfind_row\u001B[1;34m(last_dates, start_date)\u001B[0m\n\u001B[0;32m 11\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfind_row\u001B[39m(last_dates, start_date):\n\u001B[0;32m 12\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i, last_date \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(last_dates):\n\u001B[1;32m---> 13\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[43mstart_date\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m>\u001B[39;49m\u001B[43m \u001B[49m\u001B[43mlast_date\u001B[49m:\n\u001B[0;32m 14\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m i\n\u001B[0;32m 15\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(last_dates)\n", - "\u001B[1;31mTypeError\u001B[0m: '>' not supported between instances of 'float' and 'str'" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import plotly.express as px\n", - "\n", - "# Load the data\n", - "df = pd.read_csv(\"scholars.csv\", encoding='utf-8')\n", - "\n", - "# Initialize a list to track the last dateOfDeath in each row to manage overlaps\n", - "last_dates = []\n", - "\n", - "# Function to find the appropriate row for each scholar\n", - "def find_row(last_dates, start_date):\n", - " for i, last_date in enumerate(last_dates):\n", - " if start_date > last_date:\n", - " return i\n", - " return len(last_dates)\n", - "\n", - "# Assign rows without overlaps and sort by the earliest dateOfBirth\n", - "df['row'] = 0\n", - "for index, scholar in df.iterrows():\n", - " row = find_row(last_dates, scholar['dateOfBirth'])\n", - " if row < len(last_dates):\n", - " last_dates[row] = scholar['dateOfDeath']\n", - " else:\n", - " last_dates.append(scholar['dateOfDeath'])\n", - " df.at[index, 'row'] = row\n", - "\n", - "# Now plotting without row labels\n", - 
"fig = px.timeline(df, x_start=\"dateOfBirth\", x_end=\"dateOfDeath\", y=\"row\", text=\"fullName\", title=\"Scholars' Life Spans Timeline\")\n", - "\n", - "# Update layout\n", - "fig.update_layout(yaxis=dict(tickmode='array', tickvals=[], ticktext=[]))\n", - "fig.update_yaxes(autorange=\"reversed\") # This reverses the y-axis to match your requirement\n", - "\n", - "fig.show()\n", - "\n" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-11T22:37:55.002823500Z", - "start_time": "2024-03-11T22:37:53.483582400Z" - } - }, - "id": "9bdf188991f29962" - }, - { - "cell_type": "code", - "execution_count": 44, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from datetime import datetime\n", - "\n", - "# Assuming df is your existing DataFrame\n", - "\n", - "# Convert dateOfBirth and dateOfDeath to just the year, handle NaT/NaN appropriately\n", - "df['Year'] = pd.to_datetime(df['dateOfBirth'], errors='coerce').dt.year.astype('Int64')\n", - "df['End Year'] = pd.to_datetime(df['dateOfDeath'], errors='coerce').dt.year.astype('Int64')\n", - "\n", - "# Create 'Display Date' as \"dateOfBirth - dateOfDeath\"\n", - "df['Display Date'] = df['Year'].astype(str).replace('<NA>','') + ' - ' + df['End Year'].astype(str).replace('<NA>','')\n", - "\n", - "# Create 'Headline' as \"fullName (dateOfBirth - dateOfDeath)\"\n", - "df['Headline'] = df['fullName'] + ' (' + df['Display Date'] + ')'\n", - "\n", - "# Create 'Text' column by combining occupation, fieldOfWork, employer\n", - "df['Text'] = df[['occupation', 'fieldOfWork']].apply(lambda x: '<br>'.join(x.dropna()), axis=1)\n", - "\n", - "# Use the image directly; assuming the URLs are already correctly formed in the 'image' column\n", - "df['Media'] = df['image']\n", - "\n", - "# Add a \"Group\" column with the value \"actors\" for all rows\n", - "df['Group'] = 'actors'\n", - "\n", - "# fix date columns\n", - "df['Display Date'] = df['Display Date'].fillna('') # Ensure no NaNs in Display 
Date\n", - "df['Headline'] = df['Headline'].fillna('') # Ensure no NaNs in Headline\n", - "df['Text'] = df['Text'].fillna('') # Ensure no NaNs in Text\n", - "df['Media'] = df['Media'].fillna('') # Ensure no NaNs in Media\n", - "\n", - "# Now select and order the DataFrame according to the TimelineJS template requirements\n", - "columns = \"Year\tMonth\tDay\tTime\tEnd Year\tEnd Month\tEnd Day\tEnd Time\tDisplay Date\tHeadline\tText\tMedia\tMedia Credit\tMedia Caption\tMedia Thumbnail\tType\tGroup\tBackground\tLink\".split(\"\\t\")\n", - "for col in columns:\n", - " if col not in df:\n", - " df[col] = ''\n", - "timeline_df = df[columns]\n", - "\n", - "timeline_df.to_excel(\"timeline_data.xlsx\", index=False)\n" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-06T15:08:24.294712700Z", - "start_time": "2024-03-06T15:08:24.250520100Z" - } - }, - "id": "b8058de7fa9212b" - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [], - "metadata": { - "collapsed": false - }, - "id": "f4b14ea7d4941e57" } ], "metadata": {