From 5dee5c1eca1c8d804b6f1ea0634c65fcbe251696 Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Tue, 5 Mar 2024 09:41:22 +0100
Subject: [PATCH] add files from older experiments

---
 biographic-timelines/jls-editorial-board.rmd  |  35 +++
 .../output/editors-hf-llama2-13b-chat.csv     |   5 +
 ...{hf_llama2_chat_gptq.py => hf_endpoint.py} |   2 +-
 networks-over-time/jls-network-over-time.R    | 210 ++++++++++++++++++
 r-shiny-server/jls-search-authors.R           |  38 ++++
 5 files changed, 289 insertions(+), 1 deletion(-)
 create mode 100644 biographic-timelines/jls-editorial-board.rmd
 create mode 100644 langchain-experiments/data/output/editors-hf-llama2-13b-chat.csv
 rename langchain-experiments/lib/{hf_llama2_chat_gptq.py => hf_endpoint.py} (95%)
 create mode 100644 networks-over-time/jls-network-over-time.R
 create mode 100644 r-shiny-server/jls-search-authors.R

diff --git a/biographic-timelines/jls-editorial-board.rmd b/biographic-timelines/jls-editorial-board.rmd
new file mode 100644
index 0000000..c295290
--- /dev/null
+++ b/biographic-timelines/jls-editorial-board.rmd
@@ -0,0 +1,35 @@
+---
+title: "People involved in the JLS"
+output: html_notebook
+---
+
+```{r}
+library(readxl)
+library(dplyr)
+library(timevis)
+library(stringr)
+library(htmlwidgets)
+
+vertical_stripes <- "background: repeating-linear-gradient(to right, transparent, transparent 2px, lightgray 2px, lightgray 4px);"
+diagonal_stripes <- "background: repeating-linear-gradient(135deg, transparent, transparent 2px, lightgray 2px, lightgray 4px);"
+
+editors <- read_excel("data/jls-editors.xlsx") %>%
+  mutate(
+    content = name,
+    title = name,
+    start = paste0(start, "-06-01"),
+    end = str_c(ifelse(is.na(end), "2025", end), "-06-01"),
+    style = case_when(
+      gender == "Male" ~ vertical_stripes,
+      gender == "Female" ~ "background-color: white",
+      TRUE ~ "background-color: white"
+    )
+  )
+hw <- timevis(editors,
+  showZoom = FALSE,
+  width = "1500px",
+  options = list(start="1974-01-01", end = "2025-12-31", selectable=FALSE, showCurrentTime = FALSE))
+
+saveWidget(hw, "docs/jls-editors.html")
+unlink("docs/jls-editors_files", recursive = TRUE)
+```
\ No newline at end of file
diff --git a/langchain-experiments/data/output/editors-hf-llama2-13b-chat.csv b/langchain-experiments/data/output/editors-hf-llama2-13b-chat.csv
new file mode 100644
index 0000000..68f7baa
--- /dev/null
+++ b/langchain-experiments/data/output/editors-hf-llama2-13b-chat.csv
@@ -0,0 +1,5 @@
+Martinez,José,Professor
+Busse,Regierungsdirektor
+Ewald,Endres,Forst,Jurist
+Glas,VolkmarNies,Koch
+Note: Some positions have been omitted as they are not explicitly mentioned in the text.
\ No newline at end of file
diff --git a/langchain-experiments/lib/hf_llama2_chat_gptq.py b/langchain-experiments/lib/hf_endpoint.py
similarity index 95%
rename from langchain-experiments/lib/hf_llama2_chat_gptq.py
rename to langchain-experiments/lib/hf_endpoint.py
index cdd1a6f..9ecc1e8 100644
--- a/langchain-experiments/lib/hf_llama2_chat_gptq.py
+++ b/langchain-experiments/lib/hf_endpoint.py
@@ -18,7 +18,7 @@ def query(url, template, model_params = None, **params):
             "temperature": 0.1,
             "max_new_tokens": 2000
         }
-    inputs = template.format_map(**params)
+    inputs = template.format_map(params)
     payload = {
         "inputs": inputs,
         "parameters": model_params
diff --git a/networks-over-time/jls-network-over-time.R b/networks-over-time/jls-network-over-time.R
new file mode 100644
index 0000000..8152a5f
--- /dev/null
+++ b/networks-over-time/jls-network-over-time.R
@@ -0,0 +1,210 @@
+library(network)
+library(networkDynamic)
+library(tidyverse)
+library(ndtv)
+
+# config: sizes of the "top" lists, the publication-year range, and the sliding-window width in years
+top_n_cited <- 20
+top_n_citing <- 20
+year_start <- 2000
+year_end <- 2010
+years_per_slice <- 5
+
+cat("Importing data for",as.character(year_start),"-",as.character(year_end),fill=TRUE)
+df <- read.csv("data/jls-author-network-owndata.csv", encoding = "UTF-8") |>
+  filter(pub_year >= year_start & pub_year <= year_end) |>
+  rename(from = citing_author) |>
+  rename(to = cited_author) |>
+  rename(year = pub_year) |>
+  rename(count = citation_count)
+
+# Create a lookup table mapping author names (citing and cited) to integer IDs
+names <- unique(c(df$from, df$to))
+vertices <- tibble(id = seq_along(names), name = names)
+
+# Convert source and target authors to ids
+data <- df |>
+  left_join(vertices, by = c("from" = "name")) |>
+  select(-from) |>
+  rename(from = id) |>
+  left_join(vertices, by = c("to" = "name")) |>
+  select(-to) |>
+  rename(to = id) |>
+  select(from, to, year, count) |>
+  filter(from != to) # remove self-citations
+
+cat("Found", as.character(nrow(data)), "items.",fill=TRUE)
+
+cat("Determine the", as.character(top_n_cited), "most cited authors in ",
+    as.character(years_per_slice), "-year windows",fill=TRUE)
+
+# create sliding time windows: every year belongs to the windows centred on it and on its neighbours
+sliding_window <- function(year) {
+  seq(year - years_per_slice %/% 2, year + years_per_slice %/% 2)
+}
+tmp <- data |>
+  rowwise() |>
+  mutate(window = list(sliding_window(year))) |>
+  unnest(window)
+
+# Find the top_n_cited most-cited authors within each years_per_slice-year window (counted as rows, not summed `count`)
+top_to_values <- tmp |>
+  group_by(window, to) |>
+  summarise(n = n(), .groups = 'drop') |>
+  arrange(window, desc(n)) |>
+  group_by(window) |>
+  slice_head(n = top_n_cited) |>
+  ungroup()
+
+# Keep only citations whose cited author is in the top list of the window centred on the citation year
+data <- data |>
+  inner_join(top_to_values, by = c("to", "year" = "window")) |>
+  select(-n)
+
+cat("Found", as.character(nrow(data)), "items.", fill = TRUE)
+
+cat("Within these, limit to the", as.character(top_n_citing), "most citing authors...", fill = TRUE)
+
+# Find the top_n_citing most-occurring `from` values for each unique `to` value
+top_from_per_to <- data |>
+  group_by(to, from) |>
+  summarise(n = n(), .groups = 'drop') |>
+  arrange(to, desc(n)) |>
+  group_by(to) |>
+  slice_head(n = top_n_citing) |>
+  ungroup()
+
+# Filter the dataframe to include only rows with the top_n_citing most-occurring `from` values per unique `to` value
+data <- data |>
+  inner_join(top_from_per_to, by = c("from", "to")) |>
+  select(-n)
+
+cat("Found", as.character(nrow(data)), "items.",fill=TRUE)
+
+cat("Create network and activation data...",fill=TRUE)
+
+# filter the complete list of vertices to the ones contained in the edge list and add a new index
+vertex_ids <- unique(c(data$from, data$to))
+vertices <- vertices |>
+  filter(id %in% vertex_ids) |>
+  arrange(id) |>
+  mutate(new_id = row_number())
+
+# Update the 'from' and 'to' columns in the data to match these new row indices
+data <- data |>
+  left_join(vertices, by = c("from" = "id")) |>
+  select(-from) |>
+  rename(from = new_id) |>
+  left_join(vertices, by = c("to" = "id")) |>
+  select(-to) |>
+  rename(to = new_id) |>
+  select(from, to, year, count)
+
+# Create the edges data (one row per distinct from/to pair, regardless of year)
+edges <- data |>
+  select(from, to) |>
+  unique()
+
+# Create the vertex attributes
+vertex_attr <- list(name = vertices$name)
+
+# Create the network
+net <- network(matrix(c(edges$from, edges$to), ncol = 2),
+               directed = TRUE,
+               loops = FALSE,
+               vertex.attr = vertex_attr,
+               vertices = nrow(vertices))
+network.vertex.names(net) <- vertices$name
+
+cat("Computing dynamic network...",fill=TRUE)
+
+# Create edge spells with columns [onset, terminus, tail, head]
+edge_spells <- data |>
+  mutate(onset = year, terminus = year, tail = from, head = to) |>
+  select(onset, terminus, tail, head) |>
+  as.data.frame()
+
+# Create vertex spells with columns [onset, terminus, vertex_id]
+# Find the first (min) and last (max) time each vertex is mentioned and add a spell for all years in between
+vertex_spells <- edge_spells |>
+  pivot_longer(
+    cols = c(tail, head),
+    names_to = "temp_col",
+    values_to = "vertex_id"
+  ) |>
+  select(onset, terminus, vertex_id) |>
+  group_by(vertex_id) |>
+  summarise(
+    onset = min(onset),
+    terminus = max(terminus)
+  ) |>
+  ungroup() |>
+  rowwise() |>
+  summarise(
+    onset = list(seq(from = onset, to = terminus, by = 1)),
+    vertex_id = vertex_id
+  ) |>
+  unnest(onset) |>
+  arrange(vertex_id, onset) |>
+  mutate(onset = as.integer(onset)) |>
+  mutate(terminus = onset) |>
+  select(onset, terminus, vertex_id) |>
+  as.data.frame()
+
+dynNet <- networkDynamic(net,
+                         edge.spells = edge_spells,
+                         vertex.spells = vertex_spells,
+                         verbose = TRUE)
+
+cat("Rendering movie...",fill=TRUE)
+
+get_normalized_indegree <- function(slice) {
+  # Calculate in-degrees; the measure is selected via `cmode` (`gmode` only states that the graph is directed)
+  in_degree_values <- degree(slice, gmode = "digraph", cmode = "indegree")
+
+  # Normalize by the maximum in-degree
+  max_degree <- max(in_degree_values, na.rm = TRUE)
+  if (max_degree == 0) {
+    max_degree <- 1
+  }
+  normalized_in_degree <- 1 + 2 * (in_degree_values / max_degree)
+
+  # If all in-degree values are NA (for isolated nodes), set them to 0
+  if (all(is.na(normalized_in_degree))) {
+    normalized_in_degree <- rep(0, length(normalized_in_degree))
+  }
+  # Replace NAs with 0s
+  normalized_in_degree[is.na(normalized_in_degree)] <- 0
+
+  return(normalized_in_degree)
+}
+
+get_vertex_labels <- function(slice) { # vertex names for the slice, blanked where in-degree is below 3
+  in_degree_values <- degree(slice, gmode = "digraph", cmode = "indegree")
+  hide_vertex_labels_ids <- which(in_degree_values < 3)
+  existing_labels <- if ("vertex.names" %in% list.vertex.attributes(slice)) {
+    get.vertex.attribute(slice, "vertex.names")
+  } else {
+    as.character(seq_len(network.size(slice)))
+  }
+  existing_labels[hide_vertex_labels_ids] <- ""
+  return(existing_labels)
+}
+
+
+# Create the plot parameter list (function-valued entries are passed as functions of the slice network)
+plot_params <- list(
+  vertex.cex = get_normalized_indegree,
+  label = get_vertex_labels,
+  label.cex = get_normalized_indegree,
+  main="Network of most-cited authors with most-citing authors (Source: JLS dataset)",
+  displaylabels=TRUE)
+
+d3_options <- list( animationDuration=2000)
+
+render.d3movie(dynNet,
+               plot.par = plot_params,
+               d3.options = d3_options,
+               frame.duration = 5000,
+               filename = "figure/jls-most-cited-most-citing-movie.html",
+               verbose = TRUE)
diff --git a/r-shiny-server/jls-search-authors.R b/r-shiny-server/jls-search-authors.R
new file mode 100644
index 0000000..36118eb
--- /dev/null
+++ b/r-shiny-server/jls-search-authors.R
@@ -0,0 +1,38 @@
+library(shiny)
+library(visNetwork)
+
+# Sample data
+nodes <- data.frame(id = 1:5, label = c("Node 1", "Node 2", "Node 3", "Node 4", "Node 5"))
+edges <- data.frame(from = c(1, 2, 3, 4), to = c(2, 3, 4, 5))
+
+# UI
+ui <- fluidPage(
+  textInput("searchBox", "Search Node: "),
+  visNetworkOutput("network")
+)
+
+# Server logic
+server <- function(input, output, session) {
+  # Create a reactive expression based on the search input
+  reactive_network_data <- reactive({
+    search_text <- input$searchBox
+    if (search_text == "") {
+      # Empty network
+      list(nodes = data.frame(), edges = data.frame())
+    } else {
+      # Filter nodes by literal, case-insensitive substring (input such as "(" must not be parsed as a regex), then keep edges touching them
+      filtered_nodes <- nodes[grepl(tolower(search_text), tolower(nodes$label), fixed = TRUE), ]
+      filtered_edges <- edges[edges$from %in% filtered_nodes$id | edges$to %in% filtered_nodes$id, ]
+      list(nodes = filtered_nodes, edges = filtered_edges)
+    }
+  })
+
+  # Render network
+  output$network <- renderVisNetwork({
+    network_data <- reactive_network_data()
+    visNetwork(network_data$nodes, network_data$edges)
+  })
+}
+
+# Run the app
+shinyApp(ui, server)
-- 
GitLab