Commit 004f39c9 authored by alessio.quaresima

Merge branch 'master' of gitlab.socsci.ru.nl:alessio.quaresima1/spiketimit.jl

parents 25733b47 e84476fd
build
DOC
TIMIT
build
__pycache__
include("src/SpikeTimit.jl") include("../src/SpikeTimit.jl")
path = "/home/cocconat/Documents/Research/phd_project/speech/litwin-kumar_model_thesis/Spike TIMIT" # path = "/home/cocconat/Documents/Research/phd_project/speech/litwin-kumar_model_thesis/Spike TIMIT"
path = "/home/alequa/Documents/Research/phd_project/speech/Spike TIMIT/"
#Create the path strings leading to folders in the data set
test_path = joinpath(path, "test");
@@ -24,6 +25,7 @@ words = ["that", "she"]
# In this case I select all the female speakers from
# regional accent 1 that use at least one of the words in their
## I declared these functions because I don't know how to use the DataFrames query syntax properly *_*...
using DataFramesMeta
in_words(df_words) = !isempty(intersect(Set(df_words),Set(words)))
in_dialect(df_dialect) = df_dialect ∈ target_dialects
in_gender(df_gender) = occursin(df_gender, target_gender)
@@ -40,10 +42,11 @@ all_ft, all_n, words_t, phones_t = SpikeTimit.mix_inputs(;durations=durations, s
SpikeTimit.convert_to_dt(words_t, 0.1)
SpikeTimit.convert_to_dt(phones_t, 0.1)
all_ft = SpikeTimit.convert_to_dt(all_ft, 0.1)
##
words_savepoints = SpikeTimit.get_savepoints(trans= words_t, n_measure=10)
-ph_savepoints, ll = SpikeTimit.get_savepoints(trans= phones_t, n_measure=10)
+ph_savepoints = SpikeTimit.get_savepoints(trans= phones_t, n_measure=10)
phones_t
## Comparing the last firing time, the duration of all words and the
...
@@ -10,11 +10,12 @@ import os
sys.path.insert(0, os.getcwd())
print(sys.path)
"""
pwd()
TIMIT = pyimport("TIMIT_loader")
pyimport("importlib")."reload"(TIMIT)
-path = "C:\\Users\\leoni\\Desktop\\3rd_year_AI\\1_Thesis\\litwin-kumar_model_thesis\\Spike TIMIT"
+#path = "C:\\Users\\leoni\\Desktop\\3rd_year_AI\\1_Thesis\\litwin-kumar_model_thesis\\Spike TIMIT"
path = "/home/cocconat/Documents/Research/phd_project/speech/litwin-kumar_model_thesis/Spike TIMIT"
dataset = TIMIT.create_dataset(joinpath(path,"train"))
spkrinfo, spkrsent = TIMIT.create_spkrdata(path)
@@ -34,52 +35,20 @@ dict = SpikeTimit.create_dictionary(file=dict_path)
##
-# SETTING PARAMETERS
+words = ["that"]
-# Parameters to change to filter the data
+target_dialects = [1]
-measurements_per_word = 1; # how often are the states stored per word
+target_gender = "f" # "fm" "m"
-measurements_per_phone = 10; # how often are the states stored per phone
+in_words(df_words) = !isempty(intersect(Set(df_words),Set(words)))
-timestep = 0.1;
+in_dialect(df_dialect) = df_dialect ∈ target_dialects
-spikes_per_burst_increase = -1
+in_gender(df_gender) = occursin(df_gender, target_gender)
samples_per_word = 25
-repetitions = 2 # amount of times you present the network with each unique stimulus.
+# this is a DataFrameMeta macro
-silence_time = 0.15 # in seconds
+speaker = @where(train,in_dialect.(:dialect), in_gender.(:gender), in_words.(:words))
-n_features = 1 # number of features combined from input frequencies
+speaker.words
-random_seed = 10
+words = TIMIT.get_spectra(speaker |> Pandas.DataFrame, target_words=["that"])
nbr_of_pop = -1
words = ["that", "had", "she", "me", "your", "all", "like", "don't", "year", "water", "dark", "rag", "oily", "wash", "ask", "carry", "suit"]
target_dialects = "dr1 dr2 dr3 dr4 dr5 dr6 dr7 dr8"
target_gender = "f m"
# FILTERING THE DATAFRAME AND SELECTING WORDS
speakers = []
for word in words
speaker = @linq train |> # SET THIS TO TRAIN/TEST ACCORDINGLY
where(occursin.(:dialect,target_dialects), occursin.(:gender,target_gender), word .∈ :words) |>
select(:speaker) |> unique
push!(speakers,Set(speaker.speaker))
end
speakers = collect(intersect(speakers...))
filtered_df = filter(:speaker=> x->x ∈ speakers, train) # This is the filtering of the dataframe where you only select the speakers you want.
include("../src/SpikeTimit.jl")
speaker = SpikeTimit.select_inputs(df=filtered_df, words=words, samples = samples_per_word, n_feat = n_features, random_seed=random_seed);
py_words = []
for i in 1:length(words)
push!(py_words, TIMIT.get_spectra(speaker[i] |> Pandas.DataFrame, target_words=words[i]));
end
words = [(py_words...)...]
word_labels = []
for i in 1:size(words,1)
push!(word_labels, words[i].word)
end
word_labels
##
##
-#words[1].phones[1].db
+words[1].phones[1].db
##
using StatsBase
@@ -94,139 +63,25 @@ function py2j_words(words)
end
return jwords
end
-jwords =py2j_words(words)
+words =py2j_words(words)
function rate_coding_word(word::SpikeTimit.Word)
times = []
durations = []
phone_labels = []
encoding = Matrix{Float64}(undef, 20, length(word.phones))
for (n,ph) in enumerate(word.phones)
encoding[:,n] = mean(ph.db, dims=2)[:,1]
push!(times, ph.t0 - word.t0)
push!(durations, ph.t1 - ph.t0)
push!(phone_labels, ph.ph)
end
return times, durations, phone_labels, encoding, word.duration
end
# VARIABLE FOR WORD PROPERTIES IN JULIA
times = []
durations = []
phone_labels = []
rates = []
word_durations = []
for word in jwords
t, d, pl, r, wd = rate_coding_word(word)
push!(times, t)
push!(durations, d)
push!(phone_labels, pl)
push!(rates, r)
push!(word_durations, wd)
end
word_start_times = zeros(size(word_durations,1))
word_end_times = copy(word_durations)
# NORMALIZING THE RATES
typeof(rates)
size(rates[2],2)
new_rates = copy(rates)
for (i, rate) in enumerate(rates)
for ph in 1:size(rate,2)
new_rates[i][:,ph] = (rate[:,ph]./sum(rate[:,ph])) .*8
end
return times, encoding
end
new_rates[1] # the rates to each population (rows) for each phone (columns) in the word, normalized to sum to 8
# REPEATING THE WORDS
all_times = repeat(times, repetitions)
all_durations = repeat(durations, repetitions)
all_rates = repeat(new_rates, repetitions)
all_word_labels = repeat(word_labels, repetitions)
all_phone_labels = repeat(phone_labels, repetitions)
all_word_durations = repeat(word_durations, repetitions)
all_word_start_times = repeat(word_start_times, repetitions)
all_word_end_times = repeat(word_end_times, repetitions)
# SHUFFLING THE WORDS
using Random
ind = shuffle(1:size(all_times,1))
all_times = all_times[ind,:]
all_durations = all_durations[ind,:] # durations of phones
all_rates = all_rates[ind,:]
all_word_labels = all_word_labels[ind,:]
all_phone_labels = all_phone_labels[ind,:]
all_word_durations = all_word_durations[ind,:]
all_word_start_times = all_word_start_times[ind,:]
all_word_end_times = all_word_end_times[ind,:]
# STACKING THE WORDS IN TIME
# Stacking starting times of phones
all_starting_times = copy(all_times)
global_time = 0.
for (w, time_w) in enumerate(all_times)
all_starting_times[w] = time_w .+ global_time
word_duration = sum(all_durations[w])
global_time += word_duration + silence_time
end
# Stacking starting and end times of words
global_time = 0.
for w in 1:size(all_word_start_times,1)
all_word_start_times[w] = all_word_start_times[w] .+ global_time
all_word_end_times[w] = all_word_end_times[w] .+ global_time
global_time += all_word_durations[w] + silence_time
end
# FLATTENING ARRAYS FOR STIM
flat_all_times = [(all_starting_times...)...]
flat_all_durations = [(all_durations...)...]
flat_all_rates = all_rates[1]
for i in 2:size(all_rates,1)
flat_all_rates = hcat(flat_all_rates, all_rates[i])
end
flat_all_phone_labels = [(all_phone_labels...)...]
# MAKING STIMULUS MATRIX
input_Npop = 20 # nbr of frequencies
target_pops = []
for j in 1:input_Npop
push!(target_pops, j)
end
target_populations = repeat(target_pops, convert(Int, (size(flat_all_times,1)/20)))
stim = zeros(size(flat_all_times,1),4)
for i in 1:size(flat_all_times,1)
stim[i,1] = target_populations[i] # population number of this phone
stim[i,2] = flat_all_times[i]*1000 # start of interval
stim[i,3] = (flat_all_times[i] + flat_all_durations[i])*1000 # end of interval
stim[i,4] = flat_all_rates[:,i][target_populations[i]]
end
# COMPUTING SAVEPOINTS
# End point for phone savepoints
flat_all_end_times = flat_all_times .+ flat_all_durations
word_savepoints = SpikeTimit.get_savepoints(all_start_times=all_word_start_times, all_end_times = all_word_end_times, n_measure = measurements_per_word, timestep=timestep)
phone_savepoints = SpikeTimit.get_savepoints(all_start_times=flat_all_times, all_end_times = flat_all_end_times, n_measure = measurements_per_phone, timestep=timestep)
# SAVING THE INFORMATION AS H5
# stim, phone labels, word labels, phone savepoints, word savepoints, input npop, timestep
include("../src/ReadWrite.jl")
# Necessary conversions
all_word_labels = convert(Array{String}, all_word_labels)
ReadWrite.save_energy_encoding(stim, all_word_labels, flat_all_phone_labels, word_savepoints, phone_savepoints, input_Npop, timestep, "C:\\Users\\leoni\\Desktop\\3rd_year_AI\\1_Thesis\\litwin-kumar_model_thesis\\data")
-# Plots
+using Plots
-# using Plots
+times, phs = rate_coding_word(words[1])
-# times, phs = rate_coding_word(words[1])
+a = heatmap(words[1].phones[1].db)
-# a = heatmap(words[1].phones[1].db)
+b = heatmap(words[1].phones[2].db)
-# b = heatmap(words[1].phones[2].db)
+c = heatmap(words[1].phones[3].db)
-# c = heatmap(words[1].phones[3].db)
+words[1].word
-# words[1].word
+Plots.plot(a,b,c, layout=(1,3), colorbar=false, axes=nothing, ticks=nothing)
-# Plots.plot(a,b,c, layout=(1,3), colorbar=false, axes=nothing, ticks=nothing)
+times, phs = rate_coding_word(words[9])
-# times, phs = rate_coding_word(words[9])
+heatmap(phs)
-# heatmap(phs)
+words[1].phones[1].ph
-# words[1].phones[1].ph
@@ -228,7 +228,7 @@ def get_spectra(dataframe, target_words=[], cqt_p=BAE):
paths = dataframe.path
if isinstance(dataframe.path,str):
paths = [dataframe.path]
-#print(paths)
+print(paths)
for my_path in paths:
oscillogram, sr = librosa.load(my_path + ".wav")
@@ -245,7 +245,7 @@ def get_spectra(dataframe, target_words=[], cqt_p=BAE):
duration = len(oscillogram) / sr
osc_sr = len(oscillogram) / duration
db_sr = cqt.shape[1] / duration
-#print(final_time/duration, TRANSCRIPT_SR)
+print(final_time/duration, TRANSCRIPT_SR)
# %%
words, word_times = [], []
...
### A Pluto.jl notebook ###
# v0.12.21
using Markdown
using InteractiveUtils
# ╔═╡ 70951726-8063-11eb-2ba9-1fa3822b9e91
using .SpikeTimit
# ╔═╡ c0d57c40-8010-11eb-3004-195f0590db26
md"""
This notebook will show how to import data from SpikeTimit[1] database and run it as a stimulus for the LKD network[2]. The SpikeTimit.jl module import the standard daataset released with the publication [1].
"""
# ╔═╡ 62beb3c0-8011-11eb-1ab5-8de2f870d0b2
md""" Import the module and the relevant packages"""
# ╔═╡ 8015ec4a-8011-11eb-04d6-9bcd09fada86
PATH = joinpath(@__DIR__,"SpikeTimit.jl")
# ╔═╡ 5e4f6080-8063-11eb-39b1-ab78ccdbf423
include(PATH)
# ╔═╡ 24e8c3e2-8064-11eb-23dc-2f6848c499e5
# ╔═╡ 693fb58a-8063-11eb-3e59-c9249009d1d6
# ╔═╡ 4566f194-8012-11eb-39d7-ad467788a78b
md"""
Import the dataset. Notice, the spike-time are not imported, only the files are stored and ready to be read. You have to set your PATH fort the dataset
"""
# ╔═╡ 42f1c31e-8063-11eb-0059-75ffb8aa555a
begin
test_path = joinpath(@__DIR__,"Spike TIMIT", "test" );
train_path = joinpath(@__DIR__,"Spike TIMIT", "train" );
dict_path = joinpath(@__DIR__,"DOC","TIMITDIC.TXT");
end
# ╔═╡ 39885efc-8064-11eb-071d-c1eaa5f8892b
# ╔═╡ d3e653d0-8063-11eb-15e8-019ae2ff331a
md""" dict is a list of all words with the correesponding phones."""
# ╔═╡ 204518b2-8012-11eb-09f7-1f5da1ba4e1d
begin
train = SpikeTimit.create_dataset(;dir= train_path)
test = SpikeTimit.create_dataset(;dir= test_path)
dict = SpikeTimit.create_dictionary(file=dict_path)
end
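# Quick sanity check (a sketch, assuming `train` and `test` are DataFrames, as they
# are used below; `names` and `nrow` come from DataFrames):
# names(train)   # available columns, e.g. speaker, senID, path, words, phones
# nrow(train)    # number of sentences in the train set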
# ╔═╡ ae1af4c8-8011-11eb-0797-8d37104dcef5
md"""Select a subset of the dataset. Here for convenience I created a query to find all the sentences that contain the word "spring" in the train set.
ou can look up at the query and do others on the base of the DataFrame style. I suggest you to use the @linq macro and read the documentation carefully."""
# ╔═╡ f24740dc-8063-11eb-309d-e578fe133f5b
d_word = SpikeTimit.find_word(word="spring", df=train)
# ╔═╡ 4504b352-8011-11eb-0b7b-e97d88db44c9
md"""
Obviously, you can also choose the sentences by selecting some specific rows. Careful, the dataset has not an ordering.
"""
# ╔═╡ 40474e38-8011-11eb-0a99-917702896ff5
d_number = train[1,:]
# ╔═╡ 2ed1509a-8011-11eb-243e-e5df7b658803
md"""
Once you have sub-selected some columns, you can import the spike times. they are already corrected as explained in the PDF"""
# ╔═╡ 570de36a-8064-11eb-3bcb-076383a55a63
spikes= SpikeTimit.get_spiketimes(df=d_number)
# ╔═╡ 5c4b7dae-8064-11eb-1e77-b55a9c0588c0
plt1 = SpikeTimit.raster_plot(spikes[1])
plt1.plt
md"""
Also, the dataframe contains these fields:
speaker : the ID of the speaker
senID : the ID of the sentence
path : the path to access it (so you can retrieve the correct file adding the .EXT )
words : the words and their respective timing in ms
phones : the phones and their respective timing in ms
You can access it in this way:
"""
speakerID_firstsent = train[1,:speaker]
words_firstsent = train[1,:words]
# ╔═╡ 02a71dd8-8011-11eb-3f32-bb85b0c102f5
md"""
References
----------
[1] _Pan, Zihan, Yansong Chua, Jibin Wu, Malu Zhang, Haizhou Li, and Eliathamby Ambikairajah. “An Efficient and Perceptually Motivated Auditory Neural Encoding and Decoding Algorithm for Spiking Neural Networks.” Frontiers in Neuroscience 13 (2020). https://doi.org/10.3389/fnins.2019.01420._
[2] _Litwin-Kumar, Ashok, and Brent Doiron. “Formation and Maintenance of Neuronal Assemblies through Synaptic Plasticity.” Nature Communications 5, no. 1 (December 2014). https://doi.org/10.1038/ncomms6319._
"""
"""
Extract all the firing times and the corresponding neurons from an array with all the neurons and their relative firing times. i.e. the reverse_dictionary
"""
function reverse_dictionary(spikes)
all_times = Dict()
for n in eachindex(spikes)
if !isempty(spikes[n])
for t in spikes[n]
tt = round(Int,t*10000)
if haskey(all_times,tt)
all_times[tt] = [all_times[tt]..., n]
else
push!(all_times, tt=>[n])
end
end
end
end
return all_times
end
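# A small worked example (times in seconds, keys in steps of 0.1 ms):
# spikes = [[0.0012, 0.0034], [0.0012]]   # firing times of neuron 1 and neuron 2
# reverse_dictionary(spikes)              # -> Dict(12 => [1, 2], 34 => [1])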
"""
From the reverse_dictionary data structure obtain 2 arrays that are faster to access in the simulation loop.
1. First array contains the sorted spike times.
2. The second array contains vectors with each firing neuron
"""
function sort_spikes(dictionary)
neurons = Array{Vector{Int}}(undef, length(keys(dictionary)))
sorted = sort(collect(keys(dictionary)))
for (n,k) in enumerate(sorted)
neurons[n] = dictionary[k]
end
return sorted, neurons
end
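# Continuing the example above: for Dict(12 => [1, 2], 34 => [1]) this returns
# sorted = [12, 34] and neurons = [[1, 2], [1]], so spikes can be replayed in time order.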
dictionary = reverse_dictionary(spikes[1])
sorted, neurons = sort_spikes(dictionary)
## This counts all the firing events that happened
firing_index = 1
next_firing_time = sorted[firing_index]
# This is the loop in the simulation
for tt in 1:10000
    global firing_index, next_firing_time
    if tt == next_firing_time
        firing_neurons = neurons[firing_index]
        println("At time step: ", tt)
        println("These neurons fire: ", firing_neurons)
        firing_index += 1
        # stop advancing once the last spike time has been reached
        if firing_index <= length(sorted)
            next_firing_time = sorted[firing_index]
        end
    end
end
using Base
module ReadWrite
using Dates
using Printf
using Serialization
using HDF5
using OrderedCollections
using JLD
using DataFrames
include("SpikeTimit.jl")
#Reads the data from Spike TIMIT folder and returns the train and test set and dictionary
function read_data_set()
#Create the path strings leading to folders in the data set
test_path = joinpath(@__DIR__,"Spike TIMIT", "test");
train_path = joinpath(@__DIR__,"Spike TIMIT", "train");
dict_path = joinpath(@__DIR__,"DOC", "TIMITDIC.TXT");
#Create the data sets from the given path
train = SpikeTimit.create_dataset(;dir= train_path)
test = SpikeTimit.create_dataset(;dir= test_path)
dict = SpikeTimit.create_dictionary(file=dict_path)
return train, test, dict
end
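# Example usage (paths are resolved relative to this file via @__DIR__):
# train, test, dict = read_data_set()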
#Creates a folder to save all the data from one simulation run. Creates data folder if it does not exist yet
function createfolder(; folder_name="")
if !(isdir("data"))
mkdir("data")
end
#=
The default folder to save to is simulation_run_1 (inside data folder)
If this folder exists it will keep incrementing the integer
at the end of the filename until a non-existing folder is found.
=#
if folder_name == ""
n = 1
folder_name = "data/simulation_run_"
folder = string(folder_name,n)
while (isdir(folder)) # while folder already exists create new foldername
n += 1
folder = string(folder_name,n)
end
else
folder = "data/"*folder_name
end
mkdir(folder)
mkdir(string(folder, "/weights"))
mkdir(string(folder, "/mean_weights"))
mkdir(string(folder, "/ie_weights"))
mkdir(string(folder, "/word_states"))
mkdir(string(folder, "/phone_states"))
return folder
end
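# Example usage: without a name the folder auto-increments, otherwise the given name is used.
# folder = createfolder()                         # -> "data/simulation_run_1", "data/simulation_run_2", ...
# folder = createfolder(folder_name="test_run")   # -> "data/test_run", with the weights/state subfolders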
# WEIGHTS
# Saves the array of network weights to an HDF5 file
function save_network_weights(W::Array, T::Int, rd::String)
filename = abspath(rd*"/weights/weights_T$T.h5") #absolute path #somehow the location gets weird for this one..
fid = h5open(filename,"w")
fid["weights"] = W
fid["tt"] = T
close(fid)
nothing
end
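# Example (sketch; `W` stands for whatever weight array the simulation produces):
# save_network_weights(W, 1000, folder)   # writes <folder>/weights/weights_T1000.h5 with "weights" and "tt"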
function read_network_weights(rd::String; cache=true)
files = Vector()
folder = rd*"/weights/"
for file_ in readdir(string(@__DIR__, folder))
    if startswith(file_,"weights") && endswith(file_,"h5")
        filename = string(@__DIR__, folder*file_)
        h5open(filename,"r") do fid
            tt = read(fid["tt"])
            push!(files,(tt, filename))
        end
    end
end
# sort the collected files once, by time step
sort!(files, by=x->x[1])
if cache
ws = Vector{Tuple{Int,Array}}()
for (tt, file_) in files
h5open(file_,"r") do file_
fid = read(file_)
W = get_weights(fid)
push!(ws,(tt, W))
end
end
return ws
else
# channel = Channel()
ws = Vector{Tuple{Int,String}}()
for (tt, file_) in files
push!(ws,(tt, file_))
end
return ws
end
end
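# Example (sketch): read the snapshots back, sorted by time step.
# ws = read_network_weights(folder)                # Vector of (tt, W) tuples when cache=true
# ws = read_network_weights(folder; cache=false)   # Vector of (tt, filename) tuples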
# MEAN WEIGHTS
#Saves the array of the mean population weights to an HDF5 file
function save_network_mean_weights(mean_W::Array, T::Int, rd::String)
#filename = joinpath(@__DIR__, rd*"/weights/mean_weights_T$T.h5")
filename = abspath(rd*"/mean_weights/mean_weights_T$T.h5") #absolute path #absolute path #somehow the location gets weird for this one..
fid = h5open(filename,"w")
fid["weights"] = mean_W
fid["tt"] = T