From 5dee5c1eca1c8d804b6f1ea0634c65fcbe251696 Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Tue, 5 Mar 2024 09:41:22 +0100
Subject: [PATCH] add files from older experiments

---
 biographic-timelines/jls-editorial-board.rmd  |  36 +++
 .../output/editors-hf-llama2-13b-chat.csv     |   5 +
 ...{hf_llama2_chat_gptq.py => hf_endpoint.py} |   3 ++-
 networks-over-time/jls-network-over-time.R    | 216 ++++++++++++++++++
 r-shiny-server/jls-search-authors.R           |  39 ++++
 5 files changed, 298 insertions(+), 1 deletion(-)
 create mode 100644 biographic-timelines/jls-editorial-board.rmd
 create mode 100644 langchain-experiments/data/output/editors-hf-llama2-13b-chat.csv
 rename langchain-experiments/lib/{hf_llama2_chat_gptq.py => hf_endpoint.py} (95%)
 create mode 100644 networks-over-time/jls-network-over-time.R
 create mode 100644 r-shiny-server/jls-search-authors.R

diff --git a/biographic-timelines/jls-editorial-board.rmd b/biographic-timelines/jls-editorial-board.rmd
new file mode 100644
index 0000000..c295290
--- /dev/null
+++ b/biographic-timelines/jls-editorial-board.rmd
@@ -0,0 +1,36 @@
+---
+title: "People involved in the JLS"
+output: html_notebook
+---
+
+```{r}
+library(readxl)
+library(dplyr)
+library(timevis)
+library(stringr)
+library(htmlwidgets)
+
+vertical_stripes = "background: repeating-linear-gradient(to right, transparent, transparent 2px, lightgray 2px, lightgray 4px);"
+diagonal_stripes = "background: repeating-linear-gradient(135deg, transparent, transparent 2px, lightgray 2px, lightgray 4px);"
+
+editors <- read_excel("data/jls-editors.xlsx")  %>%
+  mutate(
+    content = name,
+    title = name,
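+    # terms are stored as bare years; anchor them mid-year, and give
+    # ongoing terms (end == NA) a placeholder end of 2025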
+    start = str_c(start, "-06-01"),
+    end = str_c(ifelse(is.na(end), "2025", end), "-06-01"),
+    style = case_when(
+      gender == "Male"   ~ vertical_stripes,
+      gender == "Female" ~ "background-color: white",
+      TRUE ~ "background-color: white"
+    )
+  )
+hw <- timevis(editors,
+        showZoom = FALSE,
+        width = "1500px",
+        options = list(start="1974-01-01", end = "2025-12-31", selectable=FALSE, showCurrentTime = FALSE))
+
+saveWidget(hw, "docs/jls-editors.html")
+unlink("docs/jls-editors_files", recursive = TRUE)
+```
\ No newline at end of file
diff --git a/langchain-experiments/data/output/editors-hf-llama2-13b-chat.csv b/langchain-experiments/data/output/editors-hf-llama2-13b-chat.csv
new file mode 100644
index 0000000..68f7baa
--- /dev/null
+++ b/langchain-experiments/data/output/editors-hf-llama2-13b-chat.csv
@@ -0,0 +1,5 @@
+Martinez,José,Professor
+Busse,Regierungsdirektor
+Ewald,Endres,Forst,Jurist
+Glas,VolkmarNies,Koch
+Note: Some positions have been omitted as they are not explicitly mentioned in the text.
\ No newline at end of file
diff --git a/langchain-experiments/lib/hf_llama2_chat_gptq.py b/langchain-experiments/lib/hf_endpoint.py
similarity index 95%
rename from langchain-experiments/lib/hf_llama2_chat_gptq.py
rename to langchain-experiments/lib/hf_endpoint.py
index cdd1a6f..9ecc1e8 100644
--- a/langchain-experiments/lib/hf_llama2_chat_gptq.py
+++ b/langchain-experiments/lib/hf_endpoint.py
@@ -18,7 +18,8 @@ def query(url, template, model_params = None, **params):
             "temperature": 0.1,
             "max_new_tokens": 2000
         }
-    inputs = template.format_map(**params)
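+    # str.format_map() takes the mapping itself; unpacking with ** raises a TypeError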
+    inputs = template.format_map(params)
     payload = {
         "inputs": inputs,
         "parameters": model_params
diff --git a/networks-over-time/jls-network-over-time.R b/networks-over-time/jls-network-over-time.R
new file mode 100644
index 0000000..8152a5f
--- /dev/null
+++ b/networks-over-time/jls-network-over-time.R
@@ -0,0 +1,216 @@
+library(network)
+library(networkDynamic)
+library(tidyverse)
+library(ndtv)
+
+# config
+top_n_cited <- 20
+top_n_citing <- 20
+year_start <- 2000
+year_end <- 2010
+years_per_slice <- 5
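+# top_n_cited / top_n_citing prune the network to the most active authors;
+# years_per_slice sets the width of the sliding citation window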
+
+cat("Importing data for",as.character(year_start),"-",as.character(year_end),fill=TRUE)
+df <- read.csv("data/jls-author-network-owndata.csv", encoding = "UTF-8") |>
+  filter(pub_year >= year_start & pub_year <= year_end) |>
+  rename(from = citing_author) |>
+  rename(to = cited_author) |>
+  rename(year = pub_year) |>
+  rename(count = citation_count)
+
+# Create a lookup table mapping author names to IDs
+names <- unique(c(df$from, df$to))
+vertices <- tibble(id = seq_along(names), name = names)
+
+# Convert source and target authors to ids
+data <- df |>
+  left_join(vertices, by = c("from" = "name")) |>
+  select(-from) |>
+  rename(from = id) |>
+  left_join(vertices, by = c("to" = "name")) |>
+  select(-to) |>
+  rename(to = id) |>
+  select(from, to, year, count) |>
+  filter(from != to) # remove self-citations
+
+cat("Found", as.character(nrow(data)), "items.",fill=TRUE)
+
+cat("Determine the", as.character(top_n_cited), "most cited authors in ",
+    as.character(years_per_slice), "-year windows",fill=TRUE)
+
+# create sliding time windows
+sliding_window <- function(year) {
+  seq(year - floor(years_per_slice / 2), year + floor(years_per_slice / 2))
+}
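+# e.g. with the default years_per_slice = 5, sliding_window(2003) yields 2001:2005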
+tmp <- data |>
+  rowwise() |>
+  mutate(window = list(sliding_window(year))) |>
+  unnest(window)
+
+# Find the top_n_cited most-cited authors within each sliding window
+top_to_values <- tmp |>
+  group_by(window, to) |>
+  summarise(n = n(), .groups = 'drop') |>
+  arrange(window, desc(n)) |>
+  group_by(window) |>
+  slice_head(n = top_n_cited) |>
+  ungroup()
+
+# Filter original data based on the top_to_values
+data <- data |>
+  inner_join(top_to_values, by = c("to", "year" = "window")) |>
+  select(-n)
+
+cat("Found", as.character(nrow(data)), "items.", fill = TRUE)
+
+cat("Within these, limit to the", as.character(top_n_citing), "most citing authors...", fill = TRUE)
+
+# Find the top_n_citing most-occurring `from` values for each unique `to` value
+top_from_per_to <- data |>
+  group_by(to, from) |>
+  summarise(n = n(), .groups = 'drop') |>
+  arrange(to, desc(n)) |>
+  group_by(to) |>
+  slice_head(n = top_n_citing) |>
+  ungroup()
+
+# Keep only rows whose `from` value is among the top_n_citing most-occurring for its `to` value
+data <- data |>
+  inner_join(top_from_per_to, by = c("from", "to")) |>
+  select(-n)
+
+cat("Found", as.character(nrow(data)), "items.",fill=TRUE)
+
+cat("Create network and activation data...",fill=TRUE)
+
+# filter the complete list of vertices to the ones contained in the edge list and add a new index
+vertex_ids <- unique(c(data$from, data$to))
+vertices <- vertices |>
+  filter(id %in% vertex_ids) |>
+  arrange(id) |>
+  mutate(new_id = row_number())
+
+# Update the 'from' and 'to' columns in the data to match these new row indices
+data <- data |>
+  left_join(vertices, by = c("from" = "id")) |>
+  select(-from) |>
+  rename(from = new_id) |>
+  left_join(vertices, by = c("to" = "id")) |>
+  select(-to) |>
+  rename(to = new_id) |>
+  select(from, to, year, count)
+
+# Create the edges data
+edges <- data |>
+  select(from, to) |>
+  unique()
+
+# Create the vertex attributes
+vertex_attr <- list(name = vertices$name)
+
+# Create the network
+net <- network(matrix(c(edges$from, edges$to), ncol = 2),
+               directed = TRUE,
+               loops = FALSE,
+               vertex.attr = vertex_attr,
+               vertices = nrow(vertices))
+network.vertex.names(net) <- vertices$name
+
+cat("Computing dynamic network...",fill=TRUE)
+
+# Create edge spells with columns [onset, terminus, tail, head]
+edge_spells <- data |>
+  mutate(onset = year, terminus = year, tail = from, head = to)  |>
+  select(onset, terminus, tail, head) |>
+  as.data.frame()
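+# e.g. a citation from author 3 to author 7 published in 2004 yields the spell (2004, 2004, 3, 7)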
+
+# Create vertex spells with columns [onset, terminus, vertex_id]
+# Find the first (min) and last (max) time each vertex is mentioned and add a spell for all years in between
+vertex_spells <- edge_spells |>
+  pivot_longer(
+    cols = c(tail, head),
+    names_to = "temp_col",
+    values_to = "vertex_id"
+  ) |>
+  select(onset, terminus, vertex_id) |>
+  group_by(vertex_id) |>
+  summarise(
+    onset = min(onset),
+    terminus = max(terminus)
+  ) |>
+  ungroup() |>
+  rowwise() |>
+  summarise(
+    onset = list(seq(from = onset, to = terminus, by = 1)),
+    vertex_id = vertex_id
+  ) |>
+  unnest(onset) |>
+  arrange(vertex_id, onset) |>
+  mutate(onset = as.integer(onset)) |>
+  mutate(terminus = onset) |>
+  select(onset, terminus, vertex_id) |>
+  as.data.frame()
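+# e.g. a vertex first cited in 2002 and last in 2006 gets one-year spells for 2002, 2003, ..., 2006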
+
+dynNet <- networkDynamic(net,
+               edge.spells = edge_spells,
+               vertex.spells = vertex_spells,
+               verbose = TRUE)
+
+cat("Rendering movie...",fill=TRUE)
+
+get_normalized_indegree <- function(slice) {
+  # Calculate in-degrees
+  in_degree_values <- degree(slice, cmode = "indegree")
+
+  # Normalize by the maximum in-degree
+  max_degree <- max(in_degree_values, na.rm = TRUE)
+  if (max_degree == 0) {
+    max_degree <- 1
+  }
+  normalized_in_degree <- 1 + 2 * (in_degree_values / max_degree)
+
+  # If all in-degree values are NA (for isolated nodes), set them to 0
+  if (all(is.na(normalized_in_degree))) {
+    normalized_in_degree <- rep(0, length(normalized_in_degree))
+  }
+  # Replace NAs with 0s
+  normalized_in_degree[is.na(normalized_in_degree)] <- 0
+
+  return(normalized_in_degree)
+}
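+# e.g. within a slice, in-degrees c(0, 5, 10) map to vertex sizes c(1, 2, 3)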
+
+get_vertex_labels <- function(slice) {
+  in_degree_values <- degree(slice, cmode = "indegree")
+  hide_vertex_labels_ids <- which(in_degree_values < 3)
+  existing_labels <- if ("vertex.names" %in% list.vertex.attributes(slice)) {
+    get.vertex.attribute(slice, "vertex.names")
+  } else {
+    as.character(1:network.size(slice))
+  }
+  existing_labels[hide_vertex_labels_ids] <- ""
+  return(existing_labels)
+}
+
+
+# Create the plot parameter list
+plot_params <- list(
+  vertex.cex = get_normalized_indegree,
+  label = get_vertex_labels,
+  label.cex = get_normalized_indegree,
+  main="Network of most-cited authors with most-citing authors (Source: JLS dataset)",
+  displaylabels=TRUE)
+
+d3_options <- list(animationDuration = 2000)
+
+render.d3movie(dynNet,
+               plot.par = plot_params,
+               d3.options = d3_options,
+               frame.duration = 5000,
+               filename = "figure/jls-most-cited-most-citing-movie.html",
+               verbose = TRUE)
diff --git a/r-shiny-server/jls-search-authors.R b/r-shiny-server/jls-search-authors.R
new file mode 100644
index 0000000..36118eb
--- /dev/null
+++ b/r-shiny-server/jls-search-authors.R
@@ -0,0 +1,39 @@
+library(shiny)
+library(visNetwork)
+
+# Sample data
+nodes <- data.frame(id = 1:5, label = c("Node 1", "Node 2", "Node 3", "Node 4", "Node 5"))
+edges <- data.frame(from = c(1, 2, 3, 4), to = c(2, 3, 4, 5))
+
+# UI
+ui <- fluidPage(
+  textInput("searchBox", "Search node:"),
+  visNetworkOutput("network")
+)
+
+# Server logic
+server <- function(input, output, session) {
+  # Reactive expression: recomputes the filtered network whenever input$searchBox changes
+  reactive_network_data <- reactive({
+    search_text <- input$searchBox
+    if (search_text == "") {
+      # Empty network; keep the column structure so visNetwork still sees id/label columns
+      list(nodes = nodes[0, ], edges = edges[0, ])
+    } else {
+      # Filter nodes and edges based on search criteria
+      filtered_nodes <- nodes[grep(search_text, nodes$label, ignore.case = TRUE),]
+      filtered_edges <- edges[edges$from %in% filtered_nodes$id | edges$to %in% filtered_nodes$id, ]
+      list(nodes = filtered_nodes, edges = filtered_edges)
+    }
+  })
+
+  # Render network
+  output$network <- renderVisNetwork({
+    network_data <- reactive_network_data()
+    visNetwork(network_data$nodes, network_data$edges)
+  })
+}
+
+# Run the app
+shinyApp(ui, server)
-- 
GitLab