#this file is part of litwin-kumar_doiron_formation_2014
#Copyright (C) 2014 Ashok Litwin-Kumar
#see README for more information
#==
This file looks into the properties of the SpikeTimit dataset.
I will:
1. Import the train set.
2. Select a subset of the dataset with fixed:
   regional accent (dr1)
   gender (f)
   set of words (["that", "had", "she", "me", "your", "year"])
3. Show the raster plot of a single realization of these words in the dataset
4. Train a classifier to distinguish between these words on a reduced feature space.
5. Expand the dataset to 4 regional accents and both genders and
extract the time of the phones that compose the word "year"
6. Train a classifier to distinguish between these phones on a reduced feature space.
7. Extract an average map of these phones.
==#
using Plots
using Random
using MLDataUtils
using MLJLinearModels
using StatsBase
## 1
include("SpikeTimit.jl")
test_path = joinpath(@__DIR__,"Spike TIMIT", "test");
train_path = joinpath(@__DIR__,"Spike TIMIT", "train");
dict_path = joinpath(@__DIR__,"DOC", "TIMITDIC.TXT");
train = SpikeTimit.create_dataset(;dir= train_path)
test = SpikeTimit.create_dataset(;dir= test_path)
dict = SpikeTimit.create_dictionary(file=dict_path)
## 2
Dong_words = ["that", "had", "she", "me", "your", "year"]
speakers = []
for word in Dong_words
    speaker = @linq train |>
        where(:dialect .== "dr1", :gender .== 'f', word .∈ :words) |>
        select(:speaker) |> unique
    push!(speakers, Set(speaker.speaker))
end
speakers = collect(intersect(speakers...))
single_speaker_train = @where(train,:speaker.==speakers[1])
## 3
spikes = []
plots = []
for word in Dong_words
    spiketimes, duration, phones = SpikeTimit.get_word_spiketimes(;df=single_speaker_train, word=word)
    @show typeof(spiketimes)
    spiketimes = SpikeTimit.resample_spikes(;spiketimes=spiketimes, n_feat=4)
    push!(spikes, spiketimes[1])
    rst = SpikeTimit.raster_plot(spiketimes[1])
    plot(rst.plt, title=word)
    push!(plots, rst.plt)
end
plot(plots...)
## 4. Compute average time of each word
data = zeros(6, length(spikes[1]))
for st in eachindex(spikes)
    for n in eachindex(spikes[st])
        if !isempty(spikes[st][n])
            data[st, n] = mean(spikes[st][n])
        end
    end
end
plots = []
for n = 1:6
    plt = scatter(data[n, :], label=false)
    push!(plots, plt)
end
plot(plots...)
## 4.bis Compute the distinguishability
train_dataset = zeros(size(data, 2), 1000)
labels = zeros(Int, 1000)
for x in 1:1000
    n = rand(1:6)
    train_dataset[:, x] .= data[n, :]
    labels[x] = n
end
train_dataset
labels
MultinomialLogisticRegression(train_dataset,labels)
## 4 Define classifier
function make_set_index(y::Int64, ratio::Float64)
    train, test = Vector{Int64}(), Vector{Int64}()
    ratio_0 = length(train) / y
    for index in 1:y
        # if !(index ∈ train)
        if rand() < ratio - ratio_0
            push!(train, index)
        else
            push!(test, index)
        end
    end
    return train, test
end
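# A minimal usage sketch with hypothetical sizes: split 100 sample indices with roughly
# half going to the training set; the exact sizes vary because the split is random.
tr_idx, te_idx = make_set_index(100, 0.5)
@show length(tr_idx), length(te_idx)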
function labels_to_y(labels)
    _labels = collect(Set(labels))
    z = zeros(Int, maximum(_labels))
    z[_labels] .= 1:length(_labels)
    return z[labels]
end
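# For example, labels_to_y([4, 9, 4, 9]) remaps the raw labels onto 1:n_classes,
# giving [1, 2, 1, 2] or [2, 1, 2, 1] depending on the Set iteration order.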
function MultinomialLogisticRegression(X::Matrix{Float64}, labels::Array{Int64}; λ::Float64=0.5, test_ratio=0.5)
    n_classes = length(Set(labels))
    while length(labels) > size(X, 2)
        pop!(labels)
    end
    y = labels_to_y(labels)
    n_features = size(X, 1)
    train, test = make_set_index(length(y), test_ratio)
    @show length(train)
    train_std = StatsBase.fit(ZScoreTransform, X[:, train], dims=2)
    StatsBase.transform!(train_std, X)
    intercept = false
    X[isnan.(X)] .= 0
    # deploy MultinomialRegression from MLJLinearModels, λ being the strength of the regulariser
    mnr = MultinomialRegression(λ; fit_intercept=intercept)
    # Fit the model
    θ = MLJLinearModels.fit(mnr, X[:, train]', y[train])
    # The model parameters are organized such that we can apply X⋅θ; the following is only to clarify.
    # Get the predictions X⋅θ and map each vector to its maximal element
    # return θ, X
    preds = MLJLinearModels.softmax(MLJLinearModels.apply_X(X[:, test]', θ, n_classes))
    targets = map(x -> argmax(x), eachrow(preds))
    # and evaluate the model over the labels
    scores = mean(targets .== y[test])
    params = reshape(θ, n_features + Int(intercept), n_classes)
    return scores, params
end
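## Quick self-check of the classifier on synthetic data (hypothetical shapes: 8 features,
## 200 samples, 3 classes); with unstructured random inputs the score should sit near chance (~1/3).
X_demo = randn(8, 200)
y_demo = rand(1:3, 200)
demo_score, demo_params = MultinomialLogisticRegression(X_demo, y_demo)
@show demo_score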
## Select a subset of regional dialects and the word "year".
word = "year"
single_dialect = @where(train, occursin.(:dialect, "dr1"*"dr2"*"dr3"), word .∈ :words)
df = SpikeTimit.find_word(word=word, df=single_dialect)
## Get the time intervals of all the phones contained in the word, within the dataset
phones = SpikeTimit.get_phones_in_word(;df=df, word=word)
spikes = SpikeTimit.get_spiketimes(;df)
spikes = SpikeTimit.resample_spikes(;spiketimes=spikes, n_feat=2)
phone_labels, spiketimes = SpikeTimit.get_phones_spiketimes(spikes, phones)
## Compute the mean for each phone and set the classification stage
get_mean(x) = isempty(x) ? 0. : mean(x)
data = map(neuron -> get_mean.(neuron), spiketimes)
all_labels = collect(Set(phone_labels))
train_dataset = zeros(length(data[1]), 1000)
labels = zeros(Int, 1000)
for x in 1:1000
    n = rand(1:length(data))
    phone = phone_labels[n]
    train_dataset[:, x] .= data[n]
    labels[x] = findfirst(l -> phone == l, all_labels)
end
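# Optional sanity check: how often each phone class was sampled into the training set
# (countmap comes from StatsBase, which is already loaded above).
StatsBase.countmap(labels)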
MultinomialLogisticRegression(train_dataset,labels)
## Show the average spike traces of all the phones
plots = []
_labels = []
for n in 1:length(all_labels)
    realizations = findall(l -> all_labels[n] == l, phone_labels)
    if length(realizations) > 5
        traceplot = SpikeTimit.TracePlot(length(spiketimes[1]), alpha=1/length(realizations))
        plot(traceplot.plt, title=all_labels[n])
        push!(plots, traceplot)
        push!(_labels, all_labels[n])
    end
end
plots
# plots = repeat([traceplot],length(all_labels))
for (_l, ax) in zip(_labels, plots)
    realizations = findall(l -> _l == l, phone_labels)
    @show length(realizations)
    for r in realizations
        SpikeTimit.raster_plot(spiketimes[r], ax=ax, alpha=0.1)
    end
end
plot([p.plt for p in plots]..., legend=false, alpha=0.0001)
include("../src/SpikeTimit.jl")
# path = "/home/cocconat/Documents/Research/phd_project/speech/litwin-kumar_model_thesis/Spike TIMIT"
path = "/home/alequa/Documents/Research/phd_project/speech/Spike TIMIT/"
#Create the path strings leading to folders in the data set
test_path = joinpath(path, "test");
train_path = joinpath(path, "train");
dict_path = joinpath(path, "DOC/TIMITDIC.TXT");
train = SpikeTimit.create_dataset(;dir= train_path)
test = SpikeTimit.create_dataset(;dir= test_path)
dict = SpikeTimit.create_dictionary(file=dict_path)
##
# Parameters to compute the input_data
target_dialects = [1,2,5,8]
target_gender = "f" # "fm" "m"
samples = 10
n_speakers = 1
repetitions = 75 # number of times each unique stimulus is presented to the network
silence_time = 0.15 # in seconds
n_features = 10 # number of features combined from input frequencies
words = ["that", "she"]
## Select a subset of the whole dataset.
# In this case I select all the female speakers from the target regional accents
# who use at least one of the target words in their sentences.
## I declared these helper functions because I don't know how to use the DataFrames query syntax properly *_*...
using DataFramesMeta
in_words(df_words) = !isempty(intersect(Set(df_words),Set(words)))
in_dialect(df_dialect) = df_dialect ∈ target_dialects
in_gender(df_gender) = occursin(df_gender, target_gender)
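# For example (hypothetical values): in_dialect(1) is true because 1 ∈ target_dialects,
# in_gender("f") is true because "f" occurs in target_gender, and
# in_words(["she", "had"]) is true because "she" is among the target words.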
# this is a DataFrameMeta macro
speaker = @where(train,in_dialect.(:dialect), in_gender.(:gender), in_words.(:words))
## Select the inputs
durations, spikes, labels = SpikeTimit.select_inputs(df=speaker, words=words, samples = samples, n_feat = n_features);
##
## Mix them; if you like, you can mix them differently. Look at the function, it's simple!
all_ft, all_n, words_t, phones_t = SpikeTimit.mix_inputs(;durations=durations, spikes=spikes, labels=labels, repetitions=repetitions, silence_time)
SpikeTimit.convert_to_dt(words_t, 0.1)
SpikeTimit.convert_to_dt(phones_t, 0.1)
all_ft = SpikeTimit.convert_to_dt(all_ft, 0.1)
##
words_savepoints = SpikeTimit.get_savepoints(trans= words_t, n_measure=10)
ph_savepoints = SpikeTimit.get_savepoints(trans= phones_t, n_measure=10)
phones_t
## By comparing the last firing time, the total duration of all the words, and the
## intervals of the words and phonemes, we can check that the conversion is consistent.
all_ft[end]
input_length = repetitions*(sum(durations)+(silence_time*(length(durations))))-silence_time
words_t.intervals[end]
phones_t.steps[end]
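# Worked example of the formula above with hypothetical numbers: two words of 0.5 s each,
# silence_time = 0.15 s and repetitions = 3 give 3*(1.0 + 0.15*2) - 0.15 = 3.75 s,
# which should roughly match the last firing time and the end of the last interval.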
using PyCall
using DataFrames
using DataFramesMeta
using Pandas
cd(joinpath(@__DIR__,".."))
py"""
import sys
import os
sys.path.insert(0, os.getcwd())
print(sys.path)
"""
TIMIT = pyimport("TIMIT_loader")
pyimport("importlib")."reload"(TIMIT)
#path = "C:\\Users\\leoni\\Desktop\\3rd_year_AI\\1_Thesis\\litwin-kumar_model_thesis\\Spike TIMIT"
path = "/home/cocconat/Documents/Research/phd_project/speech/litwin-kumar_model_thesis/Spike TIMIT"
dataset = TIMIT.create_dataset(joinpath(path,"train"))
spkrinfo, spkrsent = TIMIT.create_spkrdata(path)
# dataset |> Pandas.DataFrame |> DataFrames.DataFrame
##
include("../src/SpikeTimit.jl")
#Create the path strings leading to folders in the data set
test_path = joinpath(path, "test");
train_path = joinpath(path, "train");
dict_path = joinpath(path, "DOC/TIMITDIC.TXT");
train = SpikeTimit.create_dataset(;dir= train_path)
test = SpikeTimit.create_dataset(;dir= test_path)
dict = SpikeTimit.create_dictionary(file=dict_path)
##
words = ["that"]
target_dialects = [1]
target_gender = "f" # "fm" "m"
in_words(df_words) = !isempty(intersect(Set(df_words),Set(words)))
in_dialect(df_dialect) = df_dialect ∈ target_dialects
in_gender(df_gender) = occursin(df_gender, target_gender)
# this is a DataFrameMeta macro
speaker = @where(train,in_dialect.(:dialect), in_gender.(:gender), in_words.(:words))
speaker.words
words = TIMIT.get_spectra(speaker |> Pandas.DataFrame, target_words=["that"])
##
##
words[1].phones[1].db
##
using StatsBase
function py2j_words(words)
    jwords = []
    for word in words
        phs = []
        for ph in word.phones
            push!(phs, SpikeTimit.Phone(ph.ph, ph.t0, ph.t1, Array{Float64}(ph.osc), Matrix{Float64}(ph.db)))
        end
        push!(jwords, SpikeTimit.Word(word.word, phs, word.duration, word.t0, word.t1))
    end
    return jwords
end
words = py2j_words(words)
function rate_coding_word(word::SpikeTimit.Word)
    times = []
    encoding = Matrix{Float64}(undef, 20, length(word.phones))
    for (n, ph) in enumerate(word.phones)
        encoding[:, n] = mean(ph.db, dims=2)[:, 1]
        push!(times, ph.t0 - word.t0)
    end
    return times, encoding
end
using Plots
times, phs = rate_coding_word(words[1])
a = heatmap(words[1].phones[1].db)
b = heatmap(words[1].phones[2].db)
c = heatmap(words[1].phones[3].db)
words[1].word
Plots.plot(a,b,c, layout=(1,3), colorbar=false, axes=nothing, ticks=nothing)
times, phs = rate_coding_word(words[9])
heatmap(phs)
words[1].phones[1].ph
## Import the dataset. Notice: the spike times are not imported yet; only the file paths are stored, ready to be read.
include("src/SpikeTimit.jl")
using .SpikeTimit
using Plots
basedir = "/home/cocconat/Documents/Research/phd_project/speech/litwin-kumar_model_thesis/Spike TIMIT"
test_path = joinpath(basedir, "test" )
train_path = joinpath(basedir,"train" )
dict_path = joinpath(basedir,"DOC","TIMITDIC.TXT")
dict = SpikeTimit.create_dictionary(file=dict_path)
train = SpikeTimit.create_dataset(;dir= train_path)
test = SpikeTimit.create_dataset(;dir= test_path)
## function get_spiketimes(;df, word)
my_sent = train[1,:]
my_sent.words
#we want the dark word
spikes = SpikeTimit.get_spiketimes(;df=my_sent)
new_spikes = SpikeTimit.resample_spikes(spike_times=spikes[1],n_feat=8)
histogram(vcat(new_spikes...),bins=0:0.01:3 )
## Verify spike resampling is correct
z = 0
for neurons in spikes[1]
    z += length(neurons)
end
y = 0
for neurons in new_spikes
    y += length(neurons)
end
@show z, y
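# If resampling only merges frequency channels (an assumption, based on the check above),
# the two totals should agree:
@show z == y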
## Count number of spikes
my_sent.words[:,3]
counter = 0
for x in new_spikes
    counter += length(x)
end
counter
##
using .SpikeTimit
project= SpikeTimit.find_word(; word="project", df=train)
intervals = SpikeTimit.get_interval_word(;df=project, word="project")
spiketimes= SpikeTimit.get_spiketimes(; df=project)
vcat(spiketimes[1]...)
h = histogram(vcat(spiketimes[1]...),bins=0:0.01:3 )
##
plots = []
for spike in spiketimes
    # spikes = SpikeTimit.resample_spikes(spike_times=spikes, n_feat=8)
    h = histogram(vcat(spike...), bins=0:0.01:3)
    push!(plots, h)
end
plot(plots...)
sts, durations = SpikeTimit.get_spikes_in_interval(;spiketimes=spiketimes, intervals=intervals)
SpikeTimit.stack_spiketimes(sts, durations, 0.1)
isempty(Matrix{Float64}(undef,0,0))
intervals[1]
##
prompts = joinpath(@__DIR__,"DOC","PROMPTS.TXT")
function get_sentences(;prompts)
    dictionary = Dict()
    for line in readlines(prompts)
        if !startswith(line, ";")
            words = split(line, " ")
            for word in words
                a = lowercase(replace(word, [',', '.', ';'] => ""))
                if !haskey(dictionary, a)
                    push!(dictionary, a => 1)
                else
                    dictionary[a] += 1
                end
            end
        end
    end
    return dictionary
end
words = get_sentences(prompts=prompts)
sort(collect(words), by=x->x[2], rev=true)
# Select a subset of the dataset. Here, for convenience, I created a query to find all the sentences in the train set that contain the word "spring".
# You can look up the query and write others in the same DataFrames style. I suggest using the @linq macro and reading the documentation carefully.
d_word = SpikeTimit.find_word(word="spring", df=train)
# Obviously, you can also choose the sentences by selecting specific rows. Careful: the dataset has no intrinsic ordering.
d_number = train[1,:]
# Once you have sub-selected some rows, you can import the spike times;
# they are already corrected as explained in the PDF.
spikes= SpikeTimit.get_spiketimes(df=d_number)
## Measure the spike frequency on a sample
counter = 0
for x in spikes[1]
    counter += length(x)
end
duration = d_number[:phones] |> phones->map(phone->phone[end], phones)
## Data from LDK
jex =1.78
rx_n = 4.5e3
g_noise = jex*rx_n
spike_per_pop = counter/620/2.92 ## Hz per population
pmemb = 0.2
jex_s = 50
g_signal = spike_per_pop *620*pmemb*jex_s
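# Compare the two input drives; the exact numbers depend on the spike count of the chosen sample.
@show g_noise g_signal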
## Now let's convert the spike-times into a convenient input.
test_path = "/run/media/cocconat/data/speech_encoding/Spike TIMIT/Spike TIMIT/test"
test = SpikeTimit.create_dataset(;dir= test_path)
## select two samples
d_number = test[1:2,:]
spikes= SpikeTimit.get_spiketimes(df=d_number)
## Show how it works for a single sample
dictionary = SpikeTimit.reverse_dictionary(spikes[1], 0.1)
sorted, neurons = SpikeTimit.sort_spikes(dictionary)
## check that the lengths of these arrays are the same
all_ft, all_n = SpikeTimit.inputs_from_df(test, [1,2])
## This is the loop in the simulation; place it in the input section
firing_index = 1
next_firing_time = all_ft[firing_index]
for tt in 1:10000
    if tt == next_firing_time
        firing_neurons = all_n[firing_index]
        println("At time step: ", tt)
        println("These neurons fire: ", firing_neurons)
        firing_index += 1
        # guard against running past the last firing event
        if firing_index <= length(all_ft)
            next_firing_time = all_ft[firing_index]
        end
    end
end
using Plots
plot(length.(all_n))
### A Pluto.jl notebook ###
# v0.12.21
using Markdown
using InteractiveUtils
# ╔═╡ 70951726-8063-11eb-2ba9-1fa3822b9e91
using .SpikeTimit
# ╔═╡ c0d57c40-8010-11eb-3004-195f0590db26
md"""
This notebook will show how to import data from SpikeTimit[1] database and run it as a stimulus for the LKD network[2]. The SpikeTimit.jl module import the standard daataset released with the publication [1].
"""
# ╔═╡ 62beb3c0-8011-11eb-1ab5-8de2f870d0b2
md""" Import the module and the relevant packages"""
# ╔═╡ 8015ec4a-8011-11eb-04d6-9bcd09fada86
PATH = joinpath(@__DIR__,"SpikeTimit.jl")
# ╔═╡ 5e4f6080-8063-11eb-39b1-ab78ccdbf423
include(PATH)
# ╔═╡ 24e8c3e2-8064-11eb-23dc-2f6848c499e5
# ╔═╡ 693fb58a-8063-11eb-3e59-c9249009d1d6
# ╔═╡ 4566f194-8012-11eb-39d7-ad467788a78b
md"""
Import the dataset. Notice, the spike-time are not imported, only the files are stored and ready to be read. You have to set your PATH fort the dataset
"""
# ╔═╡ 42f1c31e-8063-11eb-0059-75ffb8aa555a
begin
    test_path = joinpath(@__DIR__, "Spike TIMIT", "test");
    train_path = joinpath(@__DIR__, "Spike TIMIT", "train");
    dict_path = joinpath(@__DIR__, "DOC", "TIMITDIC.TXT");
end
# ╔═╡ 39885efc-8064-11eb-071d-c1eaa5f8892b
# ╔═╡ d3e653d0-8063-11eb-15e8-019ae2ff331a
md""" dict is a list of all words with the correesponding phones."""
# ╔═╡ 204518b2-8012-11eb-09f7-1f5da1ba4e1d
begin
    train = SpikeTimit.create_dataset(;dir=train_path)
    test = SpikeTimit.create_dataset(;dir=test_path)
    dict = SpikeTimit.create_dictionary(file=dict_path)
end