Commit d37606f5 authored by c.fortmanngrote

Merge branch 'java_updates' into frederic

parents 2b062f1d e6d8e91d
Pipeline #268741 failed with stages in 2 minutes and 57 seconds
@@ -38,6 +38,19 @@ add_custom_target(patch_blast ALL
# provide libRlapack requested by r-ape.
add_custom_target(link_lapack ALL
COMMAND ln -sf ${CMAKE_INSTALL_PREFIX}/lib/R/modules/lapack.so ${CMAKE_INSTALL_PREFIX}/lib/libRlapack.so
)
# java code: build REPIN_ecology.jar with gradle and install it into the conda prefix.
ExternalProject_Add(repin_ecology
DOWNLOAD_COMMAND cp -ar ${PROJECT_SOURCE_DIR}/REPIN_ecology/REPIN_ecology/ . && pwd
CONFIGURE_COMMAND ""
BUILD_COMMAND cd ../REPIN_ecology && gradle build
INSTALL_COMMAND install ../REPIN_ecology/build/libs/REPIN_ecology.jar ${CMAKE_INSTALL_PREFIX}/lib/REPIN_ecology.jar
)
MIT License
Copyright (c) 2020 - 2021 Max Planck Institute for Evolutionary Biology
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# RepinPop
## Installation
## Compilers, build system and dependencies
The following packages are required (on a Debian-based Linux distribution):
* java (>=11)
* gcc (or alternative C compiler)
* libgsl-dev
* andi
* build-essential
* cmake (>=3.11)
* phyml
### Install dependencies on debian based linux distros (debian, *ubuntu, mint, ...)
```
sudo apt install linux-libc-dev util-linux git make gcc build-essential libgsl-dev gsl-bin andi wget zip unzip phyml
```
### Create the conda environment
```
$> conda env create -n repinpop --file=environment.yml
@@ -28,12 +33,9 @@ Activate the new environment:
$> conda activate repinpop
```
### Build and install cmake targets.
RAREFAN uses cmake to configure, build, and install most of its non-python dependencies, including the RAREFAN java code and `clustDist` [Cluster Distances into Phylogenies](https://github.com/EvolBioInf/clustDist.git) for phylogenetic analysis. Not all dependencies are available from the conda archives; these are handled by the script `CMakeLists.txt`, which is consumed by the `cmake` utility. The installation target directory is the `$CONDA_PREFIX` directory.
### Issues when building `clustDist` inside the conda env.
We observed that `clustDist` produces faulty results if compiled inside the conda environment. As a workaround, we recommend building `clustDist` with the conda environment deactivated. Nevertheless, we *install* `clustDist` and the other build products into the conda environment.
Record the value of the `$CONDA_PREFIX` environment variable, e.g.
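A minimal sketch, assuming the prefix is saved to a file `conda_prefix.txt` in the project root (the build step below reads this file):
```shell
$> echo $CONDA_PREFIX > conda_prefix.txt
$> conda deactivate
```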
@@ -49,54 +51,168 @@ conda deactivate
### Build in separate build directory.
```
$> mkdir build
$> cd build
$> cmake -DCMAKE_INSTALL_PREFIX=$(cat ../conda_prefix.txt) ..
```
The last line sets the installation prefix to `$CONDA_PREFIX`, so that the third party libraries are installed into the conda environment.
```
$> make
```
This will fetch, build, and install the dependencies into the `conda` environment created in the first
step.
## Build the java code:
Change back into the project's root directory
```shell
cd ..
```
### Set library path.
Activate the conda environment again and source `setenv.sh`: some environment variables (in particular `LD_LIBRARY_PATH`) have to be set explicitly.
```
conda activate repinpop
source setenv.sh
```
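Conceptually, `setenv.sh` points the dynamic linker at the environment's libraries; a minimal sketch of the idea (the actual script may set additional variables):
```shell
# Sketch only: make libraries installed into the conda environment
# visible to the dynamic linker.
export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}"
```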
`RepinPop` requires Java version 11 or later. Building is done by
[`gradle`](https://gradle.org).
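You can verify the toolchain, for example:
```shell
$> java -version     # should report version 11 or later
$> gradle --version
```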
Build the jar with:
```shell
$> cd REPIN_ecology/REPIN_ecology
$> gradle build
```
## Running RAREFAN from the commandline
The commandline interface to RAREFAN is implemented in *app/utilities/rarefan*. This script can be used to run RAREFAN on a directory DIR that contains genome sequences and RAYT protein fasta files.
The syntax is
```
$> rarefan [-h] [-o OUTDIR] -r REFERENCE [-c MIN_NMER_OCCURRENCE] [-l NMER_LENGTH] -q QUERY_RAYT
           [-e E_VALUE_CUTOFF] [-R] [-j THREADS] [-t TREEFILE] [-i]
           DIR
```
where the commandline arguments are explained as follows:
```
positional arguments:
DIR Contains the genome DNA sequences and RAYT AA sequences to be analyzed.
optional arguments:
-h, --help show this help message and exit
-o OUTDIR, --outdir OUTDIR
Results will be written to OUTDIR. OUTDIR will be created if not existing
(default: ./rarefan_out).
-r REFERENCE, --reference REFERENCE
Filename of the reference genome sequence
-c MIN_NMER_OCCURRENCE, --min_nmer_occurrence MIN_NMER_OCCURRENCE
Only Nmers of NMER_LENGTH that occur more frequently than MIN_NMER_OCCURRENCE
will be taken into account (default: 55). See RAREFAN manual for details.
-l NMER_LENGTH, --min_nmer_length NMER_LENGTH
Only Nmers of NMER_LENGTH that occur more frequently than MIN_NMER_OCCURRENCE
will be taken into account (default: 21). See RAREFAN manual for details.
-q QUERY_RAYT, --query_rayt QUERY_RAYT
Filename or path of the amino acid sequence file containing the RAYT protein
sequence (default: None).
-e E_VALUE_CUTOFF, --e_value_cutoff E_VALUE_CUTOFF
e-value cutoff for tblastn of the query rayt sequence against the submitted
genomes (default: 1e-30).
-R, --no-repins Do not analyse REPINS (default: False).
-j THREADS, --num_threads THREADS
Number of threads for parallel cluster analysis with MCL (default: 24).
-t TREEFILE, --treefile TREEFILE
Filename or path of the phylogenetic tree of submitted genomes (newick format,
'.nwk' extension). If none given and more than four genomes are submitted, the
tree will be calculated and written to OUTDIR/tmptree.nwk (default:
tmptree.nwk).
-i, --interactive Interactive mode. Ask for confirmation before starting the analysis run.
```
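For illustration, a hypothetical invocation (all file and directory names below are placeholders) could look like:
```shell
$> rarefan -r reference_genome.fas -q query_rayt.faa -o rarefan_out -j 4 genomes/
```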
## Running the RAREFAN web server
### Database backend
The webserver uses MongoDB as a backend. Install mongodb-server, create a database user named 'rarefan', secured by password, and a database 'rarefan'. Assign the 'dbAdmin' role for the database 'rarefan' to the 'rarefan' user. Consult the [mongodb manual](https://docs.mongodb.com/manual/tutorial/manage-users-and-roles/) if unsure how to do this.
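A rough sketch of the user setup (the password is a placeholder; refer to the linked manual for the exact syntax of your MongoDB version):
```shell
# Sketch only: create the 'rarefan' user with the dbAdmin role on the 'rarefan' database.
$> mongosh --eval 'db.getSiblingDB("rarefan").createUser({user: "rarefan", pwd: "choose-a-password", roles: [{role: "dbAdmin", db: "rarefan"}]})'
```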
### Configuration
Copy the configuration template *app/config_template.py* to *app/config.py* and edit the settings. An example is given below.
Jobs submitted to RAREFAN are processed via a redis queue. In your conda environment, install `rq` and `redis`.
```shell
$> conda install rq redis
```
```python
import os

class Config(object):
    SECRET_KEY = 'supersecretkey'
    SERVER_NAME = 'localhost:5000'
    MONGODB_SETTINGS = {
        'db': 'rarefan',
        'host': 'localhost',
        'port': 27017,
        'username': 'rarefan',
        'password': 'RaReF@npw01'
    }
    REDIS_URL = os.environ.get("REDIS_URL") or 'redis://'
    MAIL_SERVER = 'mail.my.server.com'
    MAIL_USERNAME='rarefan@mail.my.server.com'
    MAIL_PASSWORD='mailpass'
    MAIL_USE_TLS=True
    MAIL_USE_SSL=False
    MAIL_PORT=25
    MAIL_DEBUG=False
    DEFAULT_MAIL_SENDER='rarefan@mail.my.server.com'
```
## Launch the server
To launch the server, run
```
$> rq worker rarefan & # Launch a redis worker.
$> flask run
```
Then navigate your browser to http://localhost:5000.
#### NOTE
Data visualisation on a local deployment server is currently not working.
### Server notes (applies mostly to rarefan.evolbio.mpg.de production server)
#### Code updates
After a code update, the server components need to be restarted.
1. Recompile java
```console
$> cd REPIN_ecology/REPIN_ecology
$> gradle build
$> cd -
```
1. Restart python webserver
If the python code is installed in a conda environment (recommended), the environment should be activated, since the server script depends on the `$CONDA_PREFIX` variable pointing to the environment's root directory.
```console
$> sudo service rarefan restart
```
1. Restart R
Deactivate the conda environment
```console
$> conda deactivate
$> sudo service shiny-server restart
```
#### Dependency updates
Sometimes, new code also adds new software dependencies.
1. python
Simply update your conda environment
```console
$> conda env update --file=environment.yml
```
Then restart the webserver as discussed above.
1. R
Deactivate conda env and install new packages as root:
```console
$> conda deactivate
$> sudo -i
#> R
> install.packages("<new package name>")
> q()
#> exit
$>
```
Then restart shiny-server as discussed above.
## Testing
The directory *test/scripts/* contains two scripts:
### *dl_zenodo.sh*
*dl_zenodo.sh* may be used to download reference datasets from zenodo, and unpack the data into the directory *test/data/datasets/*. Datasets can be downloaded individually or together.
@@ -130,11 +246,10 @@ Syntax:
* `test_md5`: Computes md5 checksums for all datafiles in the output directory (except subdirectories) and compares them to the checksums stored in *test/md5/* (see the sketch below).
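Conceptually, the check corresponds to something like the following sketch (not the script's actual implementation; file and directory names are illustrative):
```shell
# Sketch: checksum the regular files in the output directory and compare
# against the stored reference checksums.
$> find rarefan_out -maxdepth 1 -type f -exec md5sum {} + | sort > observed.md5
$> diff observed.md5 test/md5/reference.md5
```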
### Docker
We provide a docker container that packs all dependencies of the java backend.
### Pull the container
To pull the most recent docker container, run (in a terminal)
```
......
@@ -12,245 +12,249 @@ import util.*;
import util.phylogenetics.RunTreePrograms;
public class DeterminePopulationFrequencies {
//requires mcl, andi, clustDist and BLAST+
String focalSeeds[];
ArrayList<File> genomes;
File inFolder;
int numMuts=1;
double minFrac=0.01;
//distance from repin to rayt, if within vicinity then repin cluster is associated with that rayt
String legacyBlastPerlLocation;
File queryRAYT;
File genomeFolder;
String e;
boolean analyseREPIN;
File outFolder;
HashMap<String/*genomes*/,HashMap<String/*focal seed*/,Integer/*pop size*/>> results=new HashMap<String,HashMap<String,Integer>>();
public static HashSet<String> fastaExtensions=new HashSet<String>(Arrays.asList("fas","fasta","fna","fastn","fn"));
int MCLThreads=1;
// Entry point.
public static void main(String args[]) {
// Handle wrong number of arguments.
if(args.length<11 || args.length>12) {
System.out.println("Usage: java -jar REPIN_ecology.jar IN_DIR OUT_DIR REFERENCE_STRAIN NMER_OCCURENCE MIN_NMER_LENGTH QUERY_RAYT TREEFILE E_VALUE_CUTOFF ANALYZE_REPINS MCL_THREADS DISTANCE_GROUP_SEEDS [PATH_TO_LEGACY_BLAST.PL]");
System.exit(1);
}
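// Parse the positional arguments in the order given in the usage string above.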
File inFolder=new File(args[0]);
File outFolder=new File(args[1]);
String focalSeedGenome=args[2];
int minRepFreq=Integer.parseInt(args[3]);
int wordlength=Integer.parseInt(args[4]);
File queryRAYT=new File(args[5]);
File treeFile=new File(args[6]);
String evalue=args[7];
boolean analyseREPIN=args[8].equalsIgnoreCase("true");
int MCLThreads=Integer.parseInt(args[9]);
int distanceGroupSeeds=Integer.parseInt(args[10]);
File out=new File(outFolder+"/results.txt");
DeterminePopulationFrequencies dpf;
String program="tblastn";
String legacyBlastPerlLocation="";
// legacy_blast path given as optional 12th argument.
if(args.length==12) {
legacyBlastPerlLocation=args[11];
}
dpf=new DeterminePopulationFrequencies(inFolder, outFolder,focalSeedGenome,minRepFreq,wordlength,queryRAYT,program,treeFile,legacyBlastPerlLocation,evalue,analyseREPIN,MCLThreads,distanceGroupSeeds);
dpf.print(out);
}
// Workhorse function.
public DeterminePopulationFrequencies(File inFolder,File outFolder,String focalSeedGenome,int minRepFreq,int wordlength,File queryRAYT,String program,File treeFile,String legacyBlastPerlLocation,String evalue,boolean analyseREPIN,int MCLThreads,int distanceGroupSeeds){
this.inFolder=inFolder;
this.outFolder=outFolder;
this.MCLThreads=MCLThreads;
outFolder.mkdirs();
genomes=getFiles();
this.legacyBlastPerlLocation=legacyBlastPerlLocation;
this.queryRAYT=queryRAYT;
this.focalSeeds=getFocalSeeds(focalSeedGenome,minRepFreq,wordlength,distanceGroupSeeds);
this.genomeFolder=inFolder;
this.analyseREPIN=analyseREPIN;
e=evalue;
calculateResults();
BlastRAYTs.runProgram(inFolder, queryRAYT, outFolder, e, program, getREPtype(), "yafM_relatives.fna",analyseREPIN);
// treeFile=new File(outFolder+"/"+treeFile);
// if(!treeFile.exists()) {
// generateTree(treeFile);
// }
}
private void generateTree(File treeFile) {
System.out.println("Generating Tree.");
String filenames=generateFileNameString();
String treeID=treeFile.getName().split("\\.")[0];
File distFile=new File(outFolder+"/"+treeID+".dist");
System.out.println("Running andi.");
RunTreePrograms.runProgram("andi "+filenames, "", outFolder,distFile);
System.out.println("Running clustDist.");
RunTreePrograms.runProgram("clustDist "+distFile, "", outFolder, treeFile);
}
private String generateFileNameString() {
StringBuffer sb=new StringBuffer();
File[] files=inFolder.listFiles();
for(int i=0;i<files.length;i++) {
if(hasCorrectExtension(files[i])) {
sb.append(" "+files[i]);
}
}
return sb.toString();
}
private String[] getREPtype() {
ArrayList<String> list=new ArrayList<String>();
for(int i=0;i<focalSeeds.length;i++) {
list.add(i+"");
}
return list.toArray(new String[0]);
}
private String[] getFocalSeeds(String genome,int minRepFreq,int wl,int distanceGroupSeeds) {
File fsg=new File(inFolder+"/"+genome);
DetermineFocalSeeds dfs=new DetermineFocalSeeds(fsg,outFolder,minRepFreq,wl,distanceGroupSeeds);
return dfs.getFocalSeeds();
}
public void print(File out) {
try {
BufferedWriter bw = new BufferedWriter(new FileWriter(out));
String[] genomes=results.keySet().toArray(new String[0]);
for(int i=0;i<genomes.length;i++) {
String[] seeds=results.get(genomes[i]).keySet().toArray(new String[0]);
for(int j=0;j<seeds.length;j++) {
bw.write(genomes[i].replace("_", "\t")+"\t"+seeds[j]+"\t"+results.get(genomes[i]).get(seeds[j])+"\n");
}
}
bw.close();
}catch(IOException e) {
e.printStackTrace();
System.exit(-1);
}
}
public static String getGenomeID(File in) {
return in.getName().split("\\.")[0];
}
private void calculateResults() {
System.out.println("Calculating Results.");
REPIN_RAYT_prox rrp=new REPIN_RAYT_prox(this.outFolder,focalSeeds.length);
ArrayList<String> genomeIDs=new ArrayList<String>();
// TODO: Can we parallelize this loop?
for(int i=0;i<genomes.size();i++) {
String onlyGenome=getGenomeID(genomes.get(i));
genomeIDs.add(onlyGenome);
ArrayList<REPINGenomePositions> rgp=new ArrayList<REPINGenomePositions>();
ArrayList<Info> raytPos=writeRAYTLocation(genomes.get(i));
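// For each focal seed, run the REPIN analysis in a separate per-genome, per-seed output folder.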
for(int j=0;j<focalSeeds.length;j++) {
String genomeID=onlyGenome+"_"+j;
results.put(genomeID, new HashMap<String,Integer>());
File outFolder=new File(this.outFolder+"/"+genomeID+"/");
outFolder.mkdir();
int wl=focalSeeds[j].length();
REPINProperties rp=new REPINProperties(outFolder,genomeID,genomes.get(i),wl,numMuts,minFrac,null,focalSeeds[j],false,analyseREPIN,MCLThreads);
System.out.println("Write REPINs as artemis files for "+genomeID+"...");
writeREPINArtemis(new File(outFolder+"/"+genomeID+"_largestCluster.ss"),j);
writeREPINArtemis(new File(outFolder+"/"+genomeID+".ss"),j);
File cluster;
int k=0;
while((cluster=new File(outFolder+"/"+genomeID+"_"+k+".ss")).exists()){