├── VERSION ├── transposon ├── utils.py ├── test_utils.py ├── import_filtered_genes.py ├── import_filtered_TEs.py ├── worker.py ├── notes ├── replace_names.py ├── __init__.py ├── gene_data.py └── verify_cache.py ├── examples ├── Human │ ├── list_of_chr13_genes.txt │ ├── list_of_chr7_genes.txt │ ├── src │ │ ├── sync_remote_to_local.sh │ │ ├── sync_local_to_remote.sh │ │ ├── TE_Density_Human.sb │ │ ├── import_human_te_anno.py │ │ ├── retrieve_info_of_genes.py │ │ ├── import_human_gene_anno.py │ │ └── replace_human_TE_names.py │ ├── README.md │ └── Makefile ├── Rice_Synteny │ ├── src │ │ ├── .gitignore │ │ ├── sync_remote_to_local.sh │ │ ├── sync_local_to_remote.sh │ │ ├── Annotate_EDTA_Rice_Sativa.sb │ │ ├── Annotate_EDTA_Rice_Glaberrima.sb │ │ ├── TE_Density_Sativa.sb │ │ ├── TE_Density_Glaberrima.sb │ │ ├── generate_pairs.py │ │ ├── bargraphs.py │ │ ├── fix_cds_names.py │ │ ├── replace_names_rice.py │ │ ├── import_syntelogs.py │ │ ├── fix_fasta_names.py │ │ ├── import_rice_EDTA.py │ │ ├── find_abnormal_genes.py │ │ └── import_rice_gene_anno.py │ ├── README.md │ └── Makefile ├── Arabidopsis │ ├── .gitignore │ ├── src │ │ ├── sync_remote_to_local.sh │ │ ├── sync_local_to_remote.sh │ │ ├── Annotate_EDTA_Arabidopsis_thaliana.sb │ │ ├── TE_Density_Arabidopsis.sb │ │ ├── replace_names_Arabidopsis.py │ │ ├── import_Arabidopsis_EDTA.py │ │ └── import_Arabidopsis_gene_anno.py │ └── Makefile ├── Blueberry_Expression │ ├── src │ │ ├── sync_remote_to_local.sh │ │ ├── sync_local_to_remote.sh │ │ ├── Annotate_TE_Blueberry_EDTA.sb │ │ ├── TE_Density_Blueberry.sb │ │ ├── import_blueberry_gene_anno.py │ │ ├── replace_names_blueberry.py │ │ └── import_blueberry_EDTA.py │ ├── Makefile │ └── EDTA_Blueberry.out ├── README.md └── general_read_density_data.py ├── .flake8 ├── Conditions_1.jpg ├── Conditions_2.jpg ├── .vimrc ├── config ├── test_run_config.ini └── production_run_config.ini ├── tests ├── unit │ ├── pytest.ini │ ├── test_MergeWorker.py │ ├── test_OverlapManager.py │ ├── 
test_data.py │ ├── test_gene_data.py │ ├── test_transposon_data.py │ ├── test_WorkerProcess.py │ ├── test_preprocess.py │ ├── test_import_genes.py │ ├── test_OverlapData.py │ └── test_DensityData.py ├── input_data │ ├── Test_Genes_MergeData.tsv │ ├── Test_SingleC_ConcurrentOverlap_Nameless_Revision.tsv │ ├── Test_SingleC_ConcurrentOverlap_Order_Revision.tsv │ ├── Test_SingleC_ConcurrentOverlap_Superfam_Revision.tsv │ ├── Test_TEs_MergeData.tsv │ ├── Test_Genes_DensityData.tsv │ ├── Test_SingleC_SingleElongate_Nameless_Revision.tsv │ ├── Test_SingleC_SingleElongate_Order_Revision.tsv │ ├── Test_Preprocess_Cleaned_TEs_Unequal_Number_Chrom.tsv │ ├── Test_Preprocess_Cleaned_TEs_Unequal_Chrom_IDs.tsv │ ├── Test_Gene_Anno_Float_Conversion.tsv │ ├── Test_SingleC_MultiElongate_Order_Revision.tsv │ ├── Test_SingleC_MultiElongate_Nameless_Revision.tsv │ ├── Test_Preprocess_Cleaned_Genes.tsv │ ├── Test_SingleC_MultiElongate_Superfam_Revision.tsv │ ├── Test_SingleC_SingleElongate_Superfam_Revision.tsv │ └── Test_Genes_NormMatrix.tsv ├── test_transposon.tsv └── test_transposon_single_chrom.tsv ├── RELEASE ├── .gitignore ├── setup.py ├── requirements.txt ├── CHANGELOG ├── DESIGN └── Makefile /VERSION: -------------------------------------------------------------------------------- 1 | 2.1.3 2 | -------------------------------------------------------------------------------- /transposon/utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/Human/list_of_chr13_genes.txt: -------------------------------------------------------------------------------- 1 | BRCA2 2 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | 
-------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 89 3 | exclude = .git,__pycache__ 4 | -------------------------------------------------------------------------------- /examples/Human/list_of_chr7_genes.txt: -------------------------------------------------------------------------------- 1 | GUSB 2 | CFTR 3 | PMS2 4 | NT5C3A 5 | -------------------------------------------------------------------------------- /Conditions_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjteresi/TE_Density/HEAD/Conditions_1.jpg -------------------------------------------------------------------------------- /Conditions_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjteresi/TE_Density/HEAD/Conditions_2.jpg -------------------------------------------------------------------------------- /.vimrc: -------------------------------------------------------------------------------- 1 | let HIGHLIGHT_COLS=90 "overwrite this in local .vimrc for per project setting 2 | -------------------------------------------------------------------------------- /config/test_run_config.ini: -------------------------------------------------------------------------------- 1 | [density_parameters] 2 | first_window_size = 400 3 | window_delta = 200 4 | last_window_size = 400 5 | -------------------------------------------------------------------------------- /config/production_run_config.ini: -------------------------------------------------------------------------------- 1 | [density_parameters] 2 | first_window_size = 500 3 | window_delta = 500 4 | last_window_size = 10000 5 | -------------------------------------------------------------------------------- /examples/Arabidopsis/.gitignore: 
-------------------------------------------------------------------------------- 1 | results/tables/** 2 | results/graphs/** 3 | results/chi_squared/** 4 | results/arrays_for_pat_1KB_up_down_11_29_2021.tar.gz 5 | -------------------------------------------------------------------------------- /tests/unit/pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | [pytest] 3 | filterwarnings= default 4 | ignore:.*is deprecated.*:Warning 5 | error::DeprecationWarning:importlib.* 6 | -------------------------------------------------------------------------------- /examples/Human/src/sync_remote_to_local.sh: -------------------------------------------------------------------------------- 1 | rsync -ave ssh teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Human/ /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Human/ 2 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/sync_remote_to_local.sh: -------------------------------------------------------------------------------- 1 | rsync -ave ssh teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/ /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Rice/ 2 | -------------------------------------------------------------------------------- /RELEASE: -------------------------------------------------------------------------------- 1 | 2.0.0 2 | - Upgrade h5py 2.10 --> 3.7, resulted in incompatibility w/ output H5 files 3 | + string variables now require explicit de/encode 4 | + reading previous output files not supported 5 | 6 | 1.0.0 7 | - initial release 8 | -------------------------------------------------------------------------------- /examples/Arabidopsis/src/sync_remote_to_local.sh: -------------------------------------------------------------------------------- 1 | rsync -ave ssh 
teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Arabidopsis/ /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Arabidopsis/ 2 | -------------------------------------------------------------------------------- /examples/Human/src/sync_local_to_remote.sh: -------------------------------------------------------------------------------- 1 | rsync -h -v -r -P -t -e ssh /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Human --chmod=Dg+s teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/ 2 | -------------------------------------------------------------------------------- /tests/input_data/Test_Genes_MergeData.tsv: -------------------------------------------------------------------------------- 1 | Gene_Name Chromosome Feature Start Stop Strand Length 2 | dummy_gene_1 Fvb1-1 gene 100.0 2300.0 + 2201.0 3 | dummy_gene_2 Fvb1-1 gene 6000.0 7900.0 - 1901.0 4 | dummy_gene_3 Fvb1-1 gene 8000.0 8500.0 - 501.0 5 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/sync_remote_to_local.sh: -------------------------------------------------------------------------------- 1 | rsync -ave ssh teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Blueberry/ /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Blueberry/ 2 | 3 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/sync_local_to_remote.sh: -------------------------------------------------------------------------------- 1 | rsync -h -v -r -P -t -e ssh /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Rice --chmod=Dg+s teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/ 2 | -------------------------------------------------------------------------------- 
/examples/Arabidopsis/src/sync_local_to_remote.sh: -------------------------------------------------------------------------------- 1 | rsync -h -v -r -P -t -e ssh /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Arabidopsis --chmod=Dg+s teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Arabidopsis 2 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/sync_local_to_remote.sh: -------------------------------------------------------------------------------- 1 | rsync -h -v -r -P -t -e ssh /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Blueberry --chmod=Dg+s teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Blueberry/ 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.swp 2 | **/*.swo 3 | **/*.orig 4 | **/*.pyc 5 | tmp 6 | tags 7 | cumtime 8 | .vscode 9 | tests/output_data/* 10 | tests/test_h5_cache_loc/* 11 | tests/input_data/test_swap_file.h5 12 | tests/input_data/test_swap_file_SenseSwapped.HDF5 13 | 14 | /filtered_input_data/revised_input_data/* 15 | !/filtered_input_data/revised_input_data/.gitkeep 16 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_ConcurrentOverlap_Nameless_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3350.0 3957.0 - LTR Gypsy 608.0 4 | Fvb1-1 4209.0 4761.0 - LTR Gypsy 553.0 5 | Fvb1-1 4761.0 5000.0 - DNA TIR 239.0 6 | Fvb1-1 12100.0 12300.0 - TIR hAT 201.0 7 | Fvb1-1 12150.0 12200.0 + DNA MITE 51.0 8 | -------------------------------------------------------------------------------- 
/tests/input_data/Test_SingleC_ConcurrentOverlap_Order_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3350.0 3957.0 - LTR Gypsy 608.0 4 | Fvb1-1 4209.0 4761.0 - LTR Gypsy 553.0 5 | Fvb1-1 4761.0 5000.0 - LTR Copia 239.0 6 | Fvb1-1 12100.0 12300.0 - TIR hAT 201.0 7 | Fvb1-1 12150.0 12200.0 + TIR hAT 51.0 8 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_ConcurrentOverlap_Superfam_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 4 | Fvb1-1 4209.0 4761.0 - LTR Gypsy 553.0 5 | Fvb1-1 4761.0 5000.0 - LTR Gypsy 239.0 6 | Fvb1-1 12100.0 12300.0 - TIR hAT 201.0 7 | Fvb1-1 12150.0 12200.0 + TIR hAT 51.0 8 | -------------------------------------------------------------------------------- /tests/input_data/Test_TEs_MergeData.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350 4100 - Total_TE_Density Total_TE_Density 751 3 | Fvb1-1 4870 5229 - Total_TE_Density Total_TE_Density 360 4 | Fvb1-1 8459 9370 - Total_TE_Density Total_TE_Density 912 5 | Fvb1-1 9556 9677 + Total_TE_Density Total_TE_Density 122 6 | Fvb1-1 9678 11249 - Total_TE_Density Total_TE_Density 1572 7 | Fvb1-1 11250 11469 + Total_TE_Density Total_TE_Density 220 8 | -------------------------------------------------------------------------------- /tests/input_data/Test_Genes_DensityData.tsv: -------------------------------------------------------------------------------- 1 | Gene_Name Chromosome Feature Start Stop Strand Length 2 | dummy1 Fvb1-1 gene 41.0 2396.0 + 2356.0 3 | dummy2 Fvb1-1 gene 5556.0 
7978.0 - 2423.0 4 | dummy3 Fvb1-1 gene 8487.0 8797.0 - 311.0 5 | dummy4 Fvb1-1 gene 9361.0 9658.0 + 298.0 6 | dummy5 Fvb1-1 gene 10000.0 12000.0 + 2001.0 7 | dummy6 Fvb1-1 gene 11000.0 12000.0 + 1001.0 8 | dummy7 Fvb1-1 gene 12000.0 13000.0 + 1001.0 9 | dummy8 Fvb1-1 gene 13000.0 14000.0 + 1001.0 10 | -------------------------------------------------------------------------------- /examples/Human/README.md: -------------------------------------------------------------------------------- 1 | # Genome: 2 | 3 | I acquired the human genome gene annotation through [CoGe](https://genomevolution.org/coge/GenomeInfo.pl?gid=25747). And I acquired a TE annotation through the [UCSC Genome Browser](https://genome-euro.ucsc.edu/cgi-bin/hgTables) using the options groups: repeats, track: RepeatMasker, and output format: all fields. I had to do some reformatting for this TE annotation to get it to be compliant with TE Density and those scripts are present in: xyz (**CITE**)) 4 | 5 | Reformatting of human gene annotation file, *gencode.v37.annotation.gff3*, use commands *python examples/Human/import_scripts/import_human_gene_anno.py* on the gene file. 
6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name="te-density", 6 | version="2.1.1", 7 | description="Calculates Transposable Element density", 8 | url="https://github.com/sjteresi/TE_Density", 9 | packages=find_packages(), 10 | author="Scott Teresi, Michael Teresi", 11 | license = "GPL-3.0", 12 | install_requires=[ 13 | 'coloredlogs>=15.0', 14 | 'h5py>=3.7', 15 | 'matplotlib>=3.6', 16 | 'numpy>=1.23', 17 | 'numexpr>=2.8.3', 18 | 'pandas>=1.5', 19 | 'scipy>=1.9', 20 | 'tqdm>=4.64', 21 | ], 22 | scripts=[ 23 | './process_genome.py' 24 | ], 25 | ) 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==22.1.0 2 | black==23.3.0 3 | click==8.1.3 4 | coloredlogs==15.0.1 5 | contourpy==1.0.6 6 | cycler==0.11.0 7 | exceptiongroup==1.0.4 8 | fonttools==4.38.0 9 | h5py==3.8.0 10 | humanfriendly==10.0 11 | iniconfig==1.1.1 12 | kiwisolver==1.4.4 13 | matplotlib==3.6.2 14 | mypy-extensions==0.4.3 15 | numexpr==2.8.4 16 | numpy==1.24.2 17 | packaging==23.1 18 | pandas==1.5.2 19 | pathspec==0.10.2 20 | Pillow==9.3.0 21 | platformdirs==2.5.4 22 | pluggy==1.0.0 23 | pyparsing==3.0.9 24 | pytest==7.3.1 25 | python-dateutil==2.8.2 26 | pytz==2022.6 27 | scipy==1.9.3 28 | six==1.16.0 29 | tables==3.7.0 30 | tomli==2.0.1 31 | tqdm==4.65.0 32 | typing-extensions==3.7.4.3 33 | wcwidth==0.1.7 34 | wrapt==1.11.2 35 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 2.1.3 2 | - fleshout import_filtere_genes exception message 3 | 4 | 2.1.2 5 | - refactoring, add _DensitySubset 6 | 7 | 2.1.1 8 | - add initial setup.py 9 | - 
remove tables (pytables) 10 | - upgrade numpy 1.23.5 --> 1.24.2 11 | - upgrade h5py 3.7.0 --> 3.8.0 12 | - test on cpython 3.11.3+ 13 | 14 | 2.1.0 15 | - upgrade py3.8 --> py3.10 16 | - upgrade dependencies 17 | 18 | 2.0.0 19 | - upgrade h5py 2.10 --> 3.7 20 | 21 | 1.0.1 22 | - use default h5py.File raw data chunk cache size (rdcc_nbytes), 23 | which may reduce ram usage in some cases 24 | 25 | 1.0.0 26 | - upgrade numpy 1.20.2 --> 1.23.3 27 | - upgrade pandas 1.0.5 --> 1.4.4 28 | - upgrade scipy 1.6.3 --> 1.9.1 29 | - add note on python3.8 / dependencies 30 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_SingleElongate_Nameless_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4133.0 - DNA Hat 339.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 560.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9556.0 9677.0 + Completely_Unknown Completely_Unknown 122.0 9 | Fvb1-1 9675.0 11249.0 - Completely_Unknown Completely_Unknown 1575.0 10 | Fvb1-1 23456.0 24419.0 - TIR CACTA 963.0 11 | Fvb1-1 24415.0 24625.0 - LTR Copia 211.0 12 | Fvb1-1 27000.0 27500.0 - DNA MULE 501.0 13 | Fvb1-1 27200.0 27550.0 - LTR Copia 351.0 14 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_SingleElongate_Order_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4133.0 - LTR Gypsy 339.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 560.0 6 | Fvb1-1 8459.0 9266.0 - 
Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9556.0 9677.0 + Completely_Unknown Completely_Unknown 122.0 9 | Fvb1-1 9675.0 11249.0 - Completely_Unknown Completely_Unknown 1575.0 10 | Fvb1-1 23456.0 24419.0 - TIR CACTA 963.0 11 | Fvb1-1 24415.0 24625.0 - TIR Test 211.0 12 | Fvb1-1 24000.0 24500.0 - DNA MULE 501.0 13 | Fvb1-1 24200.0 24550.0 - LTR Copia 351.0 14 | -------------------------------------------------------------------------------- /tests/input_data/Test_Preprocess_Cleaned_TEs_Unequal_Number_Chrom.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | VaccDscaff1 280.0 369.0 + LTR Unknown_LTR_Superfam 90.0 3 | VaccDscaff1 396.0 494.0 - TIR Tc1-Mariner 99.0 4 | VaccDscaff1 495.0 783.0 + TIR Tc1-Mariner 289.0 5 | VaccDscaff1 3197.0 3361.0 + TIR hAT 165.0 6 | VaccDscaff1 3722.0 3847.0 + LTR Copia 126.0 7 | VaccDscaff1 3749.0 3981.0 - TIR Tc1-Mariner 233.0 8 | VaccDscaff1 4349.0 4509.0 - Helitron Helitron 161.0 9 | VaccDscaff1 4451.0 4663.0 + Helitron Helitron 213.0 10 | VaccDscaff1 4591.0 4681.0 + TIR Mutator 91.0 11 | VaccDscaff2 4884.0 5172.0 - LTR Unknown_LTR_Superfam 289.0 12 | VaccDscaff2 7111.0 7267.0 - Helitron Helitron 157.0 13 | VaccDscaff2 7174.0 7288.0 + TIR PIF-Harbinger 115.0 14 | VaccDscaff2 7209.0 7400.0 + Helitron Helitron 192.0 15 | VaccDscaff2 10667.0 10833.0 + LTR Copia 167.0 16 | -------------------------------------------------------------------------------- /tests/input_data/Test_Preprocess_Cleaned_TEs_Unequal_Chrom_IDs.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | VaccDscaff24 280.0 369.0 + LTR Unknown_LTR_Superfam 90.0 3 | VaccDscaff24 396.0 494.0 - TIR Tc1-Mariner 99.0 4 | VaccDscaff24 495.0 783.0 + TIR Tc1-Mariner 289.0 5 | VaccDscaff24 3197.0 3361.0 + TIR hAT 
165.0 6 | VaccDscaff24 3722.0 3847.0 + LTR Copia 126.0 7 | VaccDscaff24 3749.0 3981.0 - TIR Tc1-Mariner 233.0 8 | VaccDscaff24 4349.0 4509.0 - Helitron Helitron 161.0 9 | VaccDscaff24 4451.0 4663.0 + Helitron Helitron 213.0 10 | VaccDscaff24 4591.0 4681.0 + TIR Mutator 91.0 11 | VaccDscaff24 4884.0 5172.0 - LTR Unknown_LTR_Superfam 289.0 12 | VaccDscaff24 7111.0 7267.0 - Helitron Helitron 157.0 13 | VaccDscaff24 7174.0 7288.0 + TIR PIF-Harbinger 115.0 14 | VaccDscaff24 7209.0 7400.0 + Helitron Helitron 192.0 15 | VaccDscaff24 10667.0 10833.0 + LTR Copia 167.0 16 | -------------------------------------------------------------------------------- /DESIGN: -------------------------------------------------------------------------------- 1 | # Densities 2 | 3 | 4 | # Pipeline 5 | Design for processing the gene / transposon files. 6 | 7 | Input gene / transposon files, output density files. 8 | Calculate the multiple TE densities for each gene, with a sweep of window values. 9 | 10 | ## Steps 11 | The pipeline is a generic split / apply / combine. 12 | Each density is independent, however, the densities are accumulated (summed) at the end. 13 | 14 | 15 | 0. Input gene / TE files; input window length 16 | 1. Preprocess 17 | - chunkify gene / TE based on chromosome 18 | - list sub-gene / sub-TE pairs 19 | 2. Split wrt gene names / Merge wrt TE overlap 20 | - for each sub-gene / sub-TE pair 21 | - for each window 22 | - start workers 23 | - request overlap for each gene name (handling failed requests!) 24 | - merge worker results (sums of overlaps ) 25 | - calculate densities, write to file 26 | -------------------------------------------------------------------------------- /tests/unit/test_MergeWorker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test MergeWorker. 
5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import logging 10 | import os 11 | import pytest 12 | import tempfile 13 | 14 | import coloredlogs 15 | import numpy as np 16 | import pandas as pd 17 | 18 | from transposon.merge_data import MergeData 19 | from transposon.merge_data import _MergeConfigSink, _MergeConfigSource, _SummationArgs 20 | from transposon.transposon_data import TransposonData 21 | from transposon.gene_data import GeneData 22 | from transposon.overlap import OverlapData, OverlapWorker 23 | 24 | @pytest.fixture() 25 | def temp_dir(): 26 | """Temporary directory.""" 27 | 28 | with tempfile.TemporaryDirectory() as dir: 29 | yield dir 30 | 31 | @pytest.fixture(scope="module") 32 | def overlap_data(): 33 | pass 34 | # scope=module b/c we can reuse the files b/c they are read only 35 | -------------------------------------------------------------------------------- /examples/Arabidopsis/src/Annotate_EDTA_Arabidopsis_thaliana.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J EDTA_Arabidopsis 4 | #SBATCH --time=100:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=25 7 | #SBATCH --mem-per-cpu=30G 8 | #SBATCH -o EDTA_Arabidopsis.out 9 | #-------------------------------------------------------- 10 | # NOTE the user should change these paths to match their machine 11 | EDTA_DIR=/mnt/research/edgerpat_lab/EDTA 12 | GENOME_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Arabidopsis/Sequences 13 | OUT_DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Arabidopsis/TEs 14 | 15 | 16 | # NOTE 17 | # Do these commands ahead of trying to run EDTA to properly install 18 | # git clone https://github.com/oushujun/EDTA.git 19 | # cd EDTA 20 | # conda env create -f EDTA.yml 21 | 22 | module purge 23 | module load Conda/3 24 | conda activate EDTA # activate the conda environment of packages 25 | 26 | cd $OUT_DATA_DIR # cd to output data dir for any 
extraneous files that get outputted 27 | 28 | # Run EDTA 29 | perl $EDTA_DIR/EDTA.pl --genome $GENOME_DIR/TAIR10_chr_main_chromosomes.fas --cds $GENOME_DIR/Arabidopsis_CDS.fasta --sensitive 1 --anno 1 --threads 25 30 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/Annotate_EDTA_Rice_Sativa.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J EDTA_Sativa 4 | #SBATCH --time=100:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=25 7 | #SBATCH --mem-per-cpu=30G 8 | #SBATCH -o EDTA_Sativa.out 9 | #-------------------------------------------------------- 10 | # NOTE the user should change these paths to match their machine 11 | EDTA_DIR=/mnt/research/edgerpat_lab/EDTA 12 | GENOME_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/Oryza_Sativa/Sequences 13 | OUT_DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/Oryza_Sativa/TEs 14 | 15 | 16 | # NOTE 17 | # Do these commands ahead of trying to run EDTA to properly install 18 | # git clone https://github.com/oushujun/EDTA.git 19 | # cd EDTA 20 | # conda env create -f EDTA.yml 21 | 22 | module purge 23 | module load Conda/3 24 | conda activate EDTA # activate the conda environment of packages 25 | 26 | cd $OUT_DATA_DIR # cd to output data dir for any extraneous files that get outputted 27 | 28 | # Run EDTA 29 | perl $EDTA_DIR/EDTA.pl --genome $GENOME_DIR/Oryza_Sativa_NewNames.fasta --cds $GENOME_DIR/Oryza_Sativa_CDS_NewNames.fasta --sensitive 1 --anno 1 --threads 25 30 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/Annotate_TE_Blueberry_EDTA.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J EDTA_Blueberry 4 | #SBATCH --time=167:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=25 7 | #SBATCH 
--mem-per-cpu=30G 8 | #SBATCH -o EDTA_Blueberry.out 9 | #SBATCH -e EDTA_Blueberry.err 10 | #-------------------------------------------------------- 11 | # NOTE the user should change these paths to match their machine 12 | EDTA_DIR=/mnt/research/edgerpat_lab/EDTA 13 | GENOME_DIR=/mnt/research/edgerpat_lab/Scotty/Blueberry_Data/Genome 14 | OUT_DATA_DIR=/mnt/research/edgerpat_lab/Scotty/Blueberry_Data/Blueberry_TE_Density/Annotation 15 | 16 | 17 | # NOTE 18 | # Do these commands ahead of trying to run EDTA to properly install 19 | # git clone https://github.com/oushujun/EDTA.git 20 | # cd EDTA 21 | # conda env create -f EDTA.yml 22 | 23 | module purge 24 | module load Conda/3 25 | conda activate EDTA # activate the conda environment of packages 26 | 27 | cd $OUT_DATA_DIR # cd to output data dir for any extraneous files that get outputted 28 | 29 | # Run EDTA 30 | perl $EDTA_DIR/EDTA.pl --genome $GENOME_DIR/Vaccinium_corymbosum.faa --cds $GENOME_DIR/Vacc_c_CoGe_CDS.fasta --sensitive 1 --anno 1 --threads 25 31 | -------------------------------------------------------------------------------- /tests/input_data/Test_Gene_Anno_Float_Conversion.tsv: -------------------------------------------------------------------------------- 1 | Fvb1-1 maker gene 41 2396 . + . ID=maker-Fvb1-1-snap-gene-0.15;Name=maker-Fvb1-1-snap-gene-0.15 2 | Fvb1-1 maker gene 5556 7978 . - . ID=maker-Fvb1-1-augustus-gene-0.13;Name=maker-Fvb1-1-augustus-gene-0.13 3 | Fvb1-1 maker gene 8487 8797 . - . ID=maker-Fvb1-1-snap-gene-0.18;Name=maker-Fvb1-1-snap-gene-0.18 4 | Fvb1-1 maker gene 9361 9658 . + . ID=snap_masked-Fvb1-1-processed-gene-0.6;Name=snap_masked-Fvb1-1-processed-gene-0.6 5 | Fvb1-1 maker gene 11127 11411 . - . ID=augustus_masked-Fvb1-1-processed-gene-0.4;Name=augustus_masked-Fvb1-1-processed-gene-0.4 6 | Fvb1-1 maker gene 84598 86703 . + . ID=maker-Fvb1-1-snap-gene-0.16;Name=maker-Fvb1-1-snap-gene-0.16 7 | Fvb1-1 maker gene 117287120 117715971 . + . 
ID=maker-Fvb1-1-augustus-gene-3.17;Name=maker-Fvb1-1-augustus-gene-3.17 8 | Fvb1-1 maker gene 118974314397 118974317655 . - . ID=maker-Fvb1-1-augustus-gene-3.19;Name=maker-Fvb1-1-augustus-gene-3.19 9 | Fvb1-1 maker gene 22456307315831 22456307317608 . + . ID=maker-Fvb1-1-snap-gene-3.20;Name=maker-Fvb1-1-snap-gene-3.20 10 | Fvb1-1 maker gene 88877765432319026 88877765432320584 . + . ID=augustus_masked-Fvb1-1-processed-gene-3.0;Name=augustus_masked-Fvb1-1-processed-gene-3.0 11 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_MultiElongate_Order_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4215.0 - LTR Copia 420.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 360.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9256.0 9677.0 + Completely_Unknown Completely_Unknown 421.0 9 | Fvb1-1 9700.0 10000.0 - TIR hAT 301.0 10 | Fvb1-1 9678.0 11249.0 - Completely_Unknown Completely_Unknown 1572.0 11 | Fvb1-1 11248.0 11469.0 + Completely_Unknown Completely_Unknown 220.0 12 | Fvb1-1 11450.0 11886.0 + Completely_Unknown Completely_Unknown 298.0 13 | Fvb1-1 12193.0 12404.0 - TIR hAT 212.0 14 | Fvb1-1 13625.0 13774.0 + Completely_Unknown Completely_Unknown 150.0 15 | Fvb1-1 13799.0 13892.0 - TIR Mutator 94.0 16 | Fvb1-1 13892.0 14156.0 + Helitron Helitron 265.0 17 | Fvb1-1 14154.0 15323.0 + Helitron Helitron 1170.0 18 | Fvb1-1 15320.0 16343.0 - Helitron Helitron 1114.0 19 | Fvb1-1 20236.0 20466.0 - LTR Copia 231.0 20 | Fvb1-1 20465.0 20899.0 + LTR Unknown_LTR_Superfam 435.0 21 | Fvb1-1 20880.0 22846.0 - LTR Gypsy 1966.0 22 | -------------------------------------------------------------------------------- 
/tests/input_data/Test_SingleC_MultiElongate_Nameless_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4215.0 - LTR Copia 420.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 360.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9256.0 9677.0 + Completely_Unknown Completely_Unknown 421.0 9 | Fvb1-1 9678.0 11249.0 - Completely_Unknown Completely_Unknown 1572.0 10 | Fvb1-1 9700.0 10000.0 - TIR hAT 301.0 11 | Fvb1-1 11248.0 11469.0 + Completely_Unknown Completely_Unknown 220.0 12 | Fvb1-1 11450.0 11886.0 + Completely_Unknown Completely_Unknown 298.0 13 | Fvb1-1 12193.0 12404.0 - TIR hAT 212.0 14 | Fvb1-1 13625.0 13774.0 + Completely_Unknown Completely_Unknown 150.0 15 | Fvb1-1 13799.0 13892.0 - TIR Mutator 94.0 16 | Fvb1-1 13890.0 14156.0 + Helitron Helitron 265.0 17 | Fvb1-1 14154.0 15323.0 + Helitron Helitron 1170.0 18 | Fvb1-1 15320.0 16343.0 - Helitron Helitron 1114.0 19 | Fvb1-1 20236.0 20466.0 - LTR Copia 231.0 20 | Fvb1-1 20465.0 20899.0 + LTR Unknown_LTR_Superfam 435.0 21 | Fvb1-1 20880.0 22846.0 - LTR Gypsy 1966.0 22 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/Annotate_EDTA_Rice_Glaberrima.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J EDTA_Glaberrima 4 | #SBATCH --time=167:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=25 7 | #SBATCH --mem-per-cpu=30G 8 | #SBATCH -o EDTA_Glaberrima.out 9 | #-------------------------------------------------------- 10 | # NOTE the user should change these paths to match their machine 11 | EDTA_DIR=/mnt/research/edgerpat_lab/EDTA 12 | 
GENOME_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/Oryza_Glaberrima/Sequences 13 | OUT_DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/Oryza_Glaberrima/TEs 14 | 15 | 16 | # NOTE 17 | # Do these commands ahead of trying to run EDTA to properly install 18 | # git clone https://github.com/oushujun/EDTA.git 19 | # cd EDTA 20 | # conda env create -f EDTA.yml 21 | 22 | module purge 23 | module load Conda/3 24 | conda activate EDTA # activate the conda environment of packages 25 | 26 | cd $OUT_DATA_DIR # cd to output data dir for any extraneous files that get outputted 27 | 28 | # Run EDTA 29 | perl $EDTA_DIR/EDTA.pl --genome $GENOME_DIR/Oryza_Glaberrima_NewNames.fasta --cds $GENOME_DIR/Oryza_Glaberrima_CDS_NewNames.fasta --sensitive 1 --anno 1 --threads 25 30 | -------------------------------------------------------------------------------- /tests/input_data/Test_Preprocess_Cleaned_Genes.tsv: -------------------------------------------------------------------------------- 1 | Gene_Name Chromosome Feature Start Stop Strand Length 2 | maker-VaccDscaff1-snap-gene-0.31 VaccDscaff1 gene 893.0 24185.0 + 23293.0 3 | maker-VaccDscaff1-snap-gene-0.32 VaccDscaff1 gene 24880.0 31033.0 + 6154.0 4 | maker-VaccDscaff1-snap-gene-0.36 VaccDscaff1 gene 31517.0 32457.0 - 941.0 5 | maker-VaccDscaff1-snap-gene-0.37 VaccDscaff1 gene 32462.0 39522.0 - 7061.0 6 | snap_masked-VaccDscaff1-processed-gene-0.15 VaccDscaff1 gene 49288.0 54958.0 + 5671.0 7 | maker-VaccDscaff1-augustus-gene-0.27 VaccDscaff1 gene 62151.0 89671.0 + 27521.0 8 | maker-VaccDscaff1-augustus-gene-0.30 VaccDscaff1 gene 86018.0 89723.0 - 3706.0 9 | maker-VaccDscaff1-augustus-gene-1.27 VaccDscaff1 gene 97467.0 99179.0 - 1713.0 10 | augustus_masked-VaccDscaff1-processed-gene-1.5 VaccDscaff1 gene 107244.0 110895.0 - 3652.0 11 | maker-VaccDscaff1-snap-gene-1.29 VaccDscaff1 gene 110385.0 118861.0 + 8477.0 12 | maker-VaccDscaff1-augustus-gene-1.24 VaccDscaff1 gene 128901.0 132573.0 + 
3673.0 13 | maker-VaccDscaff1-augustus-gene-1.25 VaccDscaff1 gene 147894.0 149490.0 + 1597.0 14 | maker-VaccDscaff1-snap-gene-1.32 VaccDscaff1 gene 179007.0 185476.0 + 6470.0 15 | maker-VaccDscaff1-augustus-gene-1.28 VaccDscaff1 gene 186127.0 189575.0 - 3449.0 16 | -------------------------------------------------------------------------------- /examples/Human/src/TE_Density_Human.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Human_Density 4 | #SBATCH --time=165:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=2 7 | #SBATCH --mem-per-cpu=90G 8 | #SBATCH -o Human_Density.out 9 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Human/Human_Density.out 10 | #-------------------------------------------------------- 11 | echo "" 12 | echo "Job Information" 13 | echo "Job ID:" $SLURM_JOB_ID 14 | echo "" 15 | 16 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 17 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/filtered_input_data 18 | OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Human/ 19 | GENOME="Human_Chrom_Subset" 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python process_genome.py $DATA_DIR/Cleaned_Chr7_13_Human_Genes.tsv $DATA_DIR/Cleaned_Chr7_13_Human_TEs.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 2 --reset_h5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/TE_Density_Sativa.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Sativa_Density 4 | 
#SBATCH --time=24:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --mem-per-cpu=25G 8 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Rice/Sativa_Density.out 9 | #-------------------------------------------------------- 10 | echo "" 11 | echo "Job Information" 12 | echo "Job ID:" $SLURM_JOB_ID 13 | echo "" 14 | 15 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 16 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/filtered_input_data/ 17 | OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Rice/ 18 | GENOME="Sativa" 19 | 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python process_genome.py $DATA_DIR/Cleaned_Oryza_sativa.IRGSP-1.0.50.tsv $DATA_DIR/Cleaned_Oryza_Sativa_NewNames.fasta.mod.EDTA.TEanno.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 12 --reset_h5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/TE_Density_Blueberry.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Blueberry_TE_Density 4 | #SBATCH --time=36:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=20 7 | #SBATCH --mem-per-cpu=32G 8 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Blueberry/Blueberry_Density.out 9 | #-------------------------------------------------------- 10 | echo "" 11 | echo "Job Information" 12 | echo "Job ID:" $SLURM_JOB_ID 13 | echo "" 14 | 15 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 16 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Blueberry/filtered_input_data 17 | 
OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Blueberry/ 18 | GENOME="Vacc_Cory" 19 | 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python $ROOT_DIR/process_genome.py $DATA_DIR/Cleaned_Blueberry_Genes.tsv $DATA_DIR/Cleaned_Blueberry_EDTA_TEs.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 20 --reset_h5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /examples/Arabidopsis/src/TE_Density_Arabidopsis.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Arabidopsis_Density 4 | #SBATCH --time=12:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=5 7 | #SBATCH --mem-per-cpu=8G 8 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Arabidopsis/Arabidopsis_Density.out 9 | #-------------------------------------------------------- 10 | echo "" 11 | echo "Job Information" 12 | echo "Job ID:" $SLURM_JOB_ID 13 | echo "" 14 | 15 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 16 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/filtered_input_data/ 17 | OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Arabidopsis/ 18 | GENOME="Arabidopsis" 19 | 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python process_genome.py 
$DATA_DIR/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv $DATA_DIR/Cleaned_TAIR10_chr_main_chromosomes.fas.mod.EDTA.TEanno.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/TE_Density_Glaberrima.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Glaberrima_Density 4 | #SBATCH --time=24:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --mem-per-cpu=20G 8 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Rice/Glaberrima_Density.out 9 | #-------------------------------------------------------- 10 | echo "" 11 | echo "Job Information" 12 | echo "Job ID:" $SLURM_JOB_ID 13 | echo "" 14 | 15 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 16 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/filtered_input_data/ 17 | OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Rice/ 18 | GENOME="Glaberrima" 19 | 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python process_genome.py $DATA_DIR/Cleaned_Oryza_glaberrima.Oryza_glaberrima_V1.50.tsv $DATA_DIR/Cleaned_Oryza_Glaberrima_NewNames.fasta.mod.EDTA.TEanno.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 12 --reset_h5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /transposon/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Sundry functions intended for testing and development. 
5 | """ 6 | 7 | import h5py 8 | 9 | import pytest 10 | import tempfile 11 | 12 | 13 | @pytest.fixture 14 | def temp_dir(): 15 | """Yields a temporary directory.""" 16 | 17 | # NOTE using a scope of 'module' doesn't appear to work 18 | # if you import this, so default scope is used 19 | with tempfile.TemporaryDirectory() as dir: 20 | yield dir 21 | 22 | 23 | @pytest.fixture 24 | def temp_h5_file(temp_dir): # FUTURE could be a contextmanager rather than fixture 25 | """Yields a temporary HDF5 file.""" 26 | 27 | file = tempfile.NamedTemporaryFile(dir=temp_dir, suffix=".h5") 28 | with file as temp: 29 | yield temp.name 30 | 31 | 32 | @pytest.fixture 33 | def temp_h5_context(): # FUTURE could be a contextmanager rather than fixture 34 | """Yields an open HDF5 file, writes on close.""" 35 | 36 | # MAGIC 1KB as a reasonable default 37 | with tempfile.SpooledTemporaryFile(max_size=1024*1024) as temp: 38 | # MAGIC h5py convention, 'a' is append 39 | with h5py.File(temp, 'a') as file: 40 | yield file 41 | file.flush() # NOTE likely don't need this 42 | file.close() # NOTE likely don't need this 43 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/Makefile: -------------------------------------------------------------------------------- 1 | # scripts for running blueberry TE Density examples 2 | # __file__ Makefile 3 | # __author__ Scott Teresi 4 | 5 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 6 | DEV_DATA := $(realpath $(ROOT_DIR)/../../../TE_Data/Example_Blueberry) 7 | DEV_GENES := $(DEV_DATA)/Genes/Blueberry_Genes.gff 8 | DEV_TES := $(DEV_DATA)/TEs/Blueberry_EDTA_TEs.gff 9 | DEV_FILTERED := $(DEV_DATA)/filtered_input_data 10 | DEV_GENE_EXPRESSION := $(DEV_DATA)/Genes/Blueberry_TPM_All.tsv 11 | DEV_DENSITY_FILES := $(DEV_DATA)/../../TE_Data/Example_Blueberry/ 12 | 13 | filter_genes: 14 | @echo Filtering blueberry genes into appropriate format for TE Density 15 | @echo 16 | python 
$(ROOT_DIR)/src/import_blueberry_gene_anno.py $(DEV_GENES) $(DEV_FILTERED) 17 | 18 | filter_TEs: 19 | @echo Filtering blueberry TEs into appropriate format for TE Density 20 | @echo 21 | python $(ROOT_DIR)/src/import_blueberry_EDTA.py $(DEV_TES) $(DEV_FILTERED) 22 | 23 | calculate_TE_Density: 24 | @echo Running TE Density for blueberry 25 | sbatch $(ROOT_DIR)/src/TE_Density_Blueberry.sb 26 | 27 | generate_expression_graphs: 28 | @echo Generating TE density vs. gene expression graphs 29 | @echo 30 | mkdir -p $(ROOT_DIR)/results/graphs 31 | python $(ROOT_DIR)/src/compare_expression.py $(DEV_GENE_EXPRESSION) $(DEV_GENES) $(DEV_DENSITY_FILES) -o $(ROOT_DIR)/results/graphs 32 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # scripts for development 2 | # __file__ Makefile 3 | # __author__ Michael Teresi 4 | 5 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 6 | 7 | SYS_TEST_DIR := tests/system_test_input_data 8 | SYS_TEST_GENES := $(ROOT_DIR)/$(SYS_TEST_DIR)/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv 9 | SYS_TEST_TES := $(ROOT_DIR)/$(SYS_TEST_DIR)/Cleaned_TAIR10_chr_main_chromosomes.fas.mod.EDTA.TEanno.tsv 10 | 11 | 12 | .PHONY: help 13 | help: ## Show this help 14 | @grep -E '^[a-z_A-Z0-9^.(]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 15 | 16 | 17 | .PHONY: system_test 18 | system_test: ## run system test on sample data 19 | mkdir -p ./tmp 20 | python $(ROOT_DIR)/process_genome.py $(SYS_TEST_GENES) $(SYS_TEST_TES) Test -o ./tmp 21 | 22 | 23 | .PHONY: system_clean 24 | system_clean: ## clean the system test 25 | rm -rf ./tmp 26 | 27 | 28 | .PHONY: test 29 | test: ## run the tests 30 | mkdir -p $(ROOT_DIR)/tests/test_h5_cache_loc 31 | mkdir -p $(ROOT_DIR)/tests/output_data 32 | pytest $(ROOT_DIR) 33 | 34 | 35 | .PHONY: flake8 36 | flake8: ## run style 
guide 37 | flake8 $(ROOT_DIR) 38 | 39 | 40 | .PHONY: lint 41 | lint: ## run linter 42 | pylint $(ROOT_DIR)/transposon 43 | 44 | 45 | .PHONY: tags 46 | tags: ## run ctags 47 | ctags \ 48 | $(ROOT_DIR)/*.py \ 49 | $(ROOT_DIR)/transposon/*.py 50 | -------------------------------------------------------------------------------- /tests/unit/test_OverlapManager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test OverlapManager. 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import pytest 10 | import os 11 | import tempfile 12 | 13 | from transposon.overlap_manager import OverlapManager 14 | from transposon.transposon_data import TransposonData 15 | from transposon.gene_data import GeneData 16 | 17 | N_SUBGENES = 4 # MAGIC arbitrary, enough for testing 18 | GENOME_ID = "FAKE_GENOME_ID" 19 | 20 | 21 | @pytest.fixture(scope="session") 22 | def temp_dir(): 23 | """Temporary directory.""" 24 | 25 | with tempfile.TemporaryDirectory() as dir: 26 | yield dir 27 | 28 | 29 | @pytest.fixture(scope="session") 30 | def temp_filenames(temp_dir): 31 | names = [next(tempfile._get_candidate_names()) for _ in range(N_SUBGENES)] 32 | paths = [os.path.join(temp_dir, n) for n in names] 33 | return paths 34 | 35 | 36 | @pytest.fixture(scope="session") 37 | def sub_genes(temp_filenames): 38 | 39 | genes = [GeneData.mock(genome_id=GENOME_ID) for _ in range(len(temp_filenames))] 40 | return [gene.write(path) for gene, path in zip(genes, temp_filenames)] 41 | 42 | 43 | @pytest.fixture(scope="session") 44 | def sub_transposons(temp_filenames): 45 | 46 | genes = [ 47 | TransposonData.mock(genome_id=GENOME_ID) for _ in range(len(temp_filenames)) 48 | ] 49 | return [gene.write(path) for gene, path in zip(genes, temp_filenames)] 50 | 51 | 52 | # def test_init_nothrow(sub_genes, sub_transposons): 53 | # pass 54 | 55 | 56 | if __name__ == "__main__": 57 | pytest.main(["-s", __file__]) # for convenience 58 | 
-------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_MultiElongate_Superfam_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4215.0 - LTR Copia 420.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 360.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9256.0 9677.0 + Completely_Unknown Completely_Unknown 421.0 9 | Fvb1-1 9678.0 11249.0 - Completely_Unknown Completely_Unknown 1572.0 10 | Fvb1-1 11248.0 11469.0 + Completely_Unknown Completely_Unknown 220.0 11 | Fvb1-1 11450.0 11886.0 + Completely_Unknown Completely_Unknown 298.0 12 | Fvb1-1 12193.0 12404.0 - TIR hAT 212.0 13 | Fvb1-1 13625.0 13774.0 + Completely_Unknown Completely_Unknown 150.0 14 | Fvb1-1 13799.0 13892.0 - TIR Mutator 94.0 15 | Fvb1-1 13892.0 14156.0 + Helitron Helitron 265.0 16 | Fvb1-1 14154.0 15323.0 + Helitron Helitron 1170.0 17 | Fvb1-1 15320.0 16343.0 - Helitron Helitron 1114.0 18 | Fvb1-1 16344.0 16966.0 - LTR Gypsy 623.0 19 | Fvb1-1 17033.0 17209.0 - Completely_Unknown Completely_Unknown 177.0 20 | Fvb1-1 20215.0 20599.0 + TIR hAT 385.0 21 | Fvb1-1 20236.0 20466.0 - LTR Unknown_LTR_Superfam 231.0 22 | Fvb1-1 20465.0 20899.0 + LTR Unknown_LTR_Superfam 435.0 23 | Fvb1-1 20880.0 22846.0 - LTR Unknown_LTR_Superfam 1966.0 24 | Fvb1-1 23216.0 23346.0 + Completely_Unknown Completely_Unknown 131.0 25 | Fvb1-1 23271.0 23459.0 - TIR Mutator 189.0 26 | Fvb1-1 23456.0 24419.0 - TIR CACTA 963.0 27 | Fvb1-1 24415.0 24625.0 - TIR CACTA 211.0 28 | Fvb1-1 24000.0 24500.0 - DNA MULE 501.0 29 | Fvb1-1 24200.0 24550.0 - LTR Copia 351.0 30 | -------------------------------------------------------------------------------- 
/tests/input_data/Test_SingleC_SingleElongate_Superfam_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4133.0 - LTR Copia 339.0 4 | Fvb1-1 4209.0 4761.0 - LTR Gypsy 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 560.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9556.0 9677.0 + Completely_Unknown Completely_Unknown 122.0 9 | Fvb1-1 9675.0 11249.0 - Completely_Unknown Completely_Unknown 1575.0 10 | Fvb1-1 11253.0 11469.0 + Completely_Unknown Completely_Unknown 217.0 11 | Fvb1-1 11450.0 11886.0 + Completely_Unknown Completely_Unknown 437.0 12 | Fvb1-1 12193.0 12404.0 - TIR hAT 212.0 13 | Fvb1-1 13625.0 13774.0 + Completely_Unknown Completely_Unknown 150.0 14 | Fvb1-1 13799.0 13892.0 - TIR Mutator 94.0 15 | Fvb1-1 13892.0 14156.0 + Helitron Helitron 265.0 16 | Fvb1-1 14154.0 15323.0 + Helitron Helitron 1170.0 17 | Fvb1-1 15397.0 16343.0 - LTR Unknown_LTR_Superfam 947.0 18 | Fvb1-1 16344.0 16966.0 - LTR Gypsy 623.0 19 | Fvb1-1 17033.0 17209.0 - Completely_Unknown Completely_Unknown 177.0 20 | Fvb1-1 20215.0 20599.0 + TIR hAT 385.0 21 | Fvb1-1 20236.0 20464.0 - LTR Unknown_LTR_Superfam 229.0 22 | Fvb1-1 20465.0 20899.0 + LTR Unknown_LTR_Superfam 435.0 23 | Fvb1-1 20880.0 22846.0 - LTR Unknown_LTR_Superfam 1966.0 24 | Fvb1-1 23216.0 23346.0 + Completely_Unknown Completely_Unknown 131.0 25 | Fvb1-1 23271.0 23459.0 - TIR Mutator 189.0 26 | Fvb1-1 23456.0 24419.0 - TIR CACTA 963.0 27 | Fvb1-1 24415.0 24625.0 - TIR CACTA 211.0 28 | Fvb1-1 24000.0 24500.0 - DNA MULE 501.0 29 | Fvb1-1 24200.0 24550.0 - LTR Copia 351.0 30 | -------------------------------------------------------------------------------- /tests/unit/test_data.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3 2 | 3 | """ 4 | Unit test data.py 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import pytest 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from transposon.gene_data import GeneData 15 | from transposon.gene_datum import GeneDatum 16 | from transposon.transposon_data import TransposonData 17 | 18 | @pytest.fixture 19 | def gene_data(): 20 | """Default GeneData instance.""" 21 | 22 | return GeneData.mock() 23 | 24 | def test_init(gene_data): 25 | """Does the initializer fail?""" 26 | 27 | pass 28 | 29 | def test_subset_id_unique(): 30 | """Does the chromosome identifier work if the IDs are the same?""" 31 | 32 | genes = GeneData.mock(np.array([[0, 9], [10, 19]])) 33 | same_chromosome_name = "this_name_is_consistent" 34 | genes.chromosomes[0] = same_chromosome_name 35 | genes.chromosomes[1] = same_chromosome_name 36 | assert genes.chromosome_unique_id == same_chromosome_name 37 | 38 | def test_subset_id_missing(): 39 | """Does the chromosome identifier raise if the IDs are missing?""" 40 | 41 | genes = GeneData.mock(np.array([])) 42 | with pytest.raises(RuntimeError) as excinfo: 43 | genes.chromosome_unique_id 44 | 45 | def test_subset_id_not_unique(): 46 | """Does the property raise if the chromosome IDs aren't unique?""" 47 | 48 | genes = GeneData.mock(np.array([[0, 9], [10, 19]])) 49 | genes.chromosomes[0] = "first_not_unique_chromosome_name" 50 | genes.chromosomes[1] = "this_one_is_different" 51 | with pytest.raises(RuntimeError) as excinfo: 52 | genes.chromosome_unique_id 53 | 54 | 55 | if __name__ == "__main__": 56 | pytest.main(['-s', __file__]) # for convenience 57 | -------------------------------------------------------------------------------- /examples/Human/Makefile: -------------------------------------------------------------------------------- 1 | # scripts for running human TE Density examples 2 | # __file__ Makefile 3 | # __author__ Scott Teresi 4 | 5 | ROOT_DIR:=$(shell dirname $(realpath $(firstword 
$(MAKEFILE_LIST)))) 6 | DEV_GFF_READ_EXECUTABLE := /home/scott/Documents/Uni/Research/gffread 7 | #DEV_GFF_READ_EXECUTABLE := /mnt/research/edgerpat_lab/Scotty/gffread 8 | DEV_DATA := $(realpath $(ROOT_DIR)/../../../TE_Density_Example_Data/Human) 9 | #DEV_GENES := $(DEV_DATA)/Genes/Chr7_13_Human_Genes.tsv 10 | DEV_GENES := $(DEV_DATA)/Genes/gencode.v37.annotation.gff3 11 | #DEV_TES := $(DEV_DATA)/TEs/Chr7_13_Human_TEs.tsv 12 | DEV_TES := $(DEV_DATA)/TEs/Human_TEs.tsv 13 | DEV_PROD_CONF := $(ROOT_DIR)/../../config/production_run_config.ini 14 | 15 | DEV_FILTERED := $(realpath $(ROOT_DIR)/../../../TE_Data/filtered_input_data) 16 | DEV_HDF5 := $(realpath $(ROOT_DIR)/../../../TE_Data/finalized_data/10KB) 17 | DEV_RESULTS:= $(realpath $(ROOT_DIR)/results) 18 | 19 | filter_genes: 20 | @echo Filtering human genes into appropriate format for TE Density 21 | python $(ROOT_DIR)/src/import_human_gene_anno.py $(DEV_GENES) 22 | 23 | filter_TEs: 24 | python $(ROOT_DIR)/src/import_human_te_anno.py $(DEV_TES) 25 | 26 | examine_TE_levels: 27 | @echo 28 | python $(ROOT_DIR)/src/retrieve_info_of_genes.py $(DEV_HDF5)/Human/Human_Chrom_Subset_chr7.h5 $(DEV_FILTERED)/Cleaned_Chr7_13_Human_Genes.tsv 7 Human_Chr7 $(ROOT_DIR)/list_of_chr7_genes.txt 29 | @echo 30 | python $(ROOT_DIR)/src/retrieve_info_of_genes.py $(DEV_HDF5)/Human/Human_Chrom_Subset_chr13.h5 $(DEV_FILTERED)/Cleaned_Chr7_13_Human_Genes.tsv 13 Human_Chr13 $(ROOT_DIR)/list_of_chr13_genes.txt 31 | help: ## Show this help. 
32 | fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//' 33 | -------------------------------------------------------------------------------- /tests/unit/test_gene_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test data.py 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import pytest 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from transposon.gene_data import GeneData 15 | from transposon.gene_datum import GeneDatum 16 | from transposon.transposon_data import TransposonData 17 | 18 | @pytest.fixture 19 | def gene_data(): 20 | """Default GeneData instance.""" 21 | 22 | return GeneData.mock() 23 | 24 | def test_init(gene_data): 25 | """Does the initializer fail?""" 26 | 27 | pass 28 | 29 | def test_subset_id_unique(): 30 | """Does the chromosome identifier work if the IDs are the same?""" 31 | 32 | genes = GeneData.mock(np.array([[0, 9], [10, 19]])) 33 | same_chromosome_name = "this_name_is_consistent" 34 | genes.chromosomes[0] = same_chromosome_name 35 | genes.chromosomes[1] = same_chromosome_name 36 | assert genes.chromosome_unique_id == same_chromosome_name 37 | 38 | def test_subset_id_missing(): 39 | """Does the chromosome identifier raise if the IDs are missing?""" 40 | 41 | genes = GeneData.mock(np.array([])) 42 | with pytest.raises(RuntimeError) as excinfo: 43 | genes.chromosome_unique_id 44 | 45 | def test_subset_id_not_unique(): 46 | """Does the property raise if the chromosome IDs aren't unique?""" 47 | 48 | genes = GeneData.mock(np.array([[0, 9], [10, 19]])) 49 | genes.chromosomes[0] = "first_not_unique_chromosome_name" 50 | genes.chromosomes[1] = "this_one_is_different" 51 | with pytest.raises(RuntimeError) as excinfo: 52 | genes.chromosome_unique_id 53 | 54 | 55 | if __name__ == "__main__": 56 | pytest.main(['-s', __file__]) # for convenience 57 | 
-------------------------------------------------------------------------------- /tests/unit/test_transposon_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test transposon_data.py 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import pytest 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from transposon.gene_data import GeneData 15 | from transposon.gene_datum import GeneDatum 16 | from transposon.transposon_data import TransposonData 17 | 18 | @pytest.fixture 19 | def te_data(): 20 | """Default TransposonData instance.""" 21 | 22 | return TransposonData.mock() 23 | 24 | def test_init(te_data): 25 | """Does the initializer fail?""" 26 | 27 | pass 28 | 29 | def test_subset_id_unique(): 30 | """Does the chromosome identifier work if the IDs are the same?""" 31 | 32 | transposons = TransposonData.mock(np.array([[0, 9], [10, 19]])) 33 | same_chromosome_name = "this_name_is_consistent" 34 | transposons.chromosomes[0] = same_chromosome_name 35 | transposons.chromosomes[1] = same_chromosome_name 36 | assert transposons.chromosome_unique_id == same_chromosome_name 37 | 38 | def test_subset_id_missing(): 39 | """Does the chromosome identifier raise if the IDs are missing?""" 40 | 41 | transposons = TransposonData.mock(np.array([])) 42 | with pytest.raises(RuntimeError) as excinfo: 43 | transposons.chromosome_unique_id 44 | 45 | def test_subset_id_not_unique(): 46 | """Does the property raise if the chromosome IDs aren't unique?""" 47 | 48 | transposons = TransposonData.mock(np.array([[3, 9], [12, 25]])) 49 | transposons.chromosomes[0] = "first_not_unique_chromosome_name" 50 | transposons.chromosomes[1] = 'Different_chromosome_name_from_default' 51 | with pytest.raises(RuntimeError) as excinfo: 52 | transposons.chromosome_unique_id 53 | 54 | 55 | 56 | if __name__ == "__main__": 57 | pytest.main(['-s', __file__]) # for convenience 58 | 
-------------------------------------------------------------------------------- /tests/unit/test_WorkerProcess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test WorkerProcess. 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | 10 | from collections import namedtuple 11 | import coloredlogs 12 | import logging 13 | import os 14 | import pytest 15 | import multiprocessing 16 | 17 | from transposon.worker import WorkerProcess 18 | 19 | 20 | class FakeWorker(WorkerProcess): 21 | """Implementation of WorkerProcess for testing.""" 22 | 23 | def execute_job(self, job): 24 | """Arbitrary job to square an input.""" 25 | 26 | logging.debug("got {}".format(job)) 27 | return (job * job, os.getpid()) 28 | 29 | 30 | @pytest.fixture() 31 | def worker(): 32 | mgr = multiprocessing.Manager 33 | input = multiprocessing.Queue() 34 | output = multiprocessing.Queue() 35 | stop = multiprocessing.Event() 36 | yield FakeWorker(input, output, stop) 37 | 38 | @pytest.fixture() 39 | def worker_running(worker): 40 | """Yield a running process.""" 41 | worker.start() 42 | yield worker 43 | worker.stop_event.set() 44 | worker.join() 45 | 46 | def test_start_stop(worker): 47 | """Can we start / stop the worker?""" 48 | 49 | worker.start() 50 | worker.stop_event.set() 51 | worker.join() 52 | 53 | def test_answer(worker_running): 54 | """Do we get feedback?""" 55 | number = 4 56 | worker_running.input.put(4) 57 | answer, worker_pid = worker_running.output.get(timeout=1) 58 | assert answer == number**2 59 | 60 | def test_newprocess(worker_running): 61 | """Is the worker in a different process?""" 62 | 63 | worker_running.input.put(4) 64 | answer, worker_pid = worker_running.output.get(timeout=1) 65 | assert os.getpid() != worker_pid 66 | 67 | 68 | if __name__ == "__main__": 69 | logger = logging.getLogger(__name__) 70 | coloredlogs.install(level=logging.INFO) 71 | pytest.main(['-s', __file__]) # for 
convenience 72 | -------------------------------------------------------------------------------- /transposon/import_filtered_genes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from transposon import check_nulls, check_strand 3 | 4 | 5 | def import_filtered_genes(genes_input_path, logger): 6 | """ 7 | Import the preprocessed gene annotation file. Read it as pandas dataframe 8 | 9 | genes_input_path (str): Path to cleaned input annotation file of genes 10 | 11 | logger (logging.Logger): Logging object 12 | 13 | Returns: 14 | gene_data (pandas.core.frame.DataFrame): A pandas dataframe 15 | representing the preprocessed gene annotation file 16 | """ 17 | try: 18 | gene_data = pd.read_csv( 19 | genes_input_path, 20 | header="infer", 21 | sep="\t", 22 | dtype={ 23 | "Start": "float64", 24 | "Stop": "float64", 25 | "Length": "float64", 26 | "Chromosome": str, 27 | "Strand": str, 28 | "Feature": str, 29 | "Gene_Name": str, 30 | }, 31 | ) 32 | except Exception as err: 33 | msg = """ 34 | Error occurred while trying to read preprocessed gene 35 | annotation file into a Pandas dataframe, please refer 36 | to the README as to what information is expected 37 | input file: %s 38 | """ 39 | logger.critical(msg, genes_input_path) 40 | raise err 41 | 42 | gene_data.set_index("Gene_Name", verify_integrity=True, inplace=True) 43 | check_nulls(gene_data, logger) 44 | check_strand(gene_data, logger) 45 | 46 | # NOTE edit Strand '.' values to be sense orientation as 47 | # described in check_strand() 48 | gene_data["Strand"].replace(to_replace={".": "+"}, inplace=True) 49 | 50 | # Sort for legibility 51 | gene_data.sort_values(by=["Chromosome", "Start"], inplace=True) 52 | 53 | logger.info("import of preprocessed gene annotation... 
success!") 54 | return gene_data 55 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/generate_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Master code file. Control filtration of syntelog data and generate summary 5 | table 6 | """ 7 | 8 | __author__ = "Scott Teresi" 9 | 10 | import argparse 11 | import os 12 | import logging 13 | import coloredlogs 14 | 15 | from import_syntelogs import import_syntelogs 16 | from transposon import check_nulls 17 | 18 | 19 | def process( 20 | syntelog_input_file, 21 | data_output_path, 22 | ): 23 | # Import the synteny data from raw file 24 | logger.info("Importing syntelogs: %s" % syntelog_input_file) 25 | syntelogs = import_syntelogs(syntelog_input_file) 26 | check_nulls(syntelogs, logger) 27 | 28 | # Wrap the data 29 | file_to_save = os.path.join(data_output_path, "set_syntelogs.tsv") 30 | logger.info("Writing syntelog data to disk: %s" % file_to_save) 31 | syntelogs.to_csv(file_to_save, sep="\t", header=True, index=False) 32 | 33 | 34 | if __name__ == "__main__": 35 | """Command line interface to link syntelogs together.""" 36 | 37 | parser = argparse.ArgumentParser(description="Filter syntelogs") 38 | path_main = os.path.abspath(__file__) 39 | parser.add_argument( 40 | "syntelog_input_file", type=str, help="parent path of syntelog file" 41 | ) 42 | parser.add_argument( 43 | "--output_directory", 44 | "-o", 45 | type=str, 46 | help="parent path of output directory", 47 | default=os.path.join(path_main, "../../../../examples/Rice_Synteny/results"), 48 | ) 49 | 50 | parser.add_argument( 51 | "-v", "--verbose", action="store_true", help="set debugging level to DEBUG" 52 | ) 53 | 54 | args = parser.parse_args() 55 | args.syntelog_input_file = os.path.abspath(args.syntelog_input_file) 56 | args.output_directory = os.path.abspath(args.output_directory) 57 | log_level = logging.DEBUG if 
args.verbose else logging.INFO 58 | logger = logging.getLogger(__name__) 59 | coloredlogs.install(level=log_level) 60 | 61 | # Process 62 | logger.info("Starting filtration...") 63 | process( 64 | args.syntelog_input_file, 65 | args.output_directory, 66 | ) 67 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples: 2 | The purpose of this repository is to demonstrate the usage of TE Density on multiple different genomes as well as show interesting application and data visualization opportunities using the TE Density's output data sets. 3 | The makefile in each genome-specific directory contains the sequence of commands we used to perform each analysis. 4 | 5 | 6 | ## Example 1 - Genome-Wide Trends of TE Presence Relative to Genes (Arabidopsis): 7 | The TE Density toolkit may be used to investigate the relationship of genes and TE presence genome-wide. 8 | Here, we create a dotplot of the average TE density values of all genes by TE type, upstream, intragenically, and downstream for chromosome 1 of the Arabidopsis genome. 9 | 10 | ### Step 1 - Create a CDS FASTA file for use in creating a TE annotation: 11 | Please reference the makefile command `create_CDS`, here we use the GFFRead tool to generate a CDS FASTA file which is useful in creating a TE annotation with EDTA. 12 | 13 | ### Step 2 - Generate a TE Annotation with EDTA: 14 | Please reference the makefile command `run_EDTA_HPCC`, here we run EDTA on MSU's high-performance computing cluster (HPCC). 15 | 16 | ### Step 3 - Preprocess each annotation file prior to running TE Density: 17 | Please reference the makefile commands `filter_genes` and `filter_TEs`, here we use the Python package Pandas to perform reformat the annotation files. This part will likely need to be custom tailored to the user's own annotation files. 
18 | 19 | ### Step 4 - Run TE Density: 20 | Please reference the makefile command `run_TE_Density_HPCC`, here we call `process_genome.py` for the Arabidopsis genome but do so in an SBATCH script because we are using the resources of the HPCC. It is better to err on the side of more RAM if the genome is taking too long to compute, it may have stalled out due to insufficient RAM. 21 | 22 | ### Step 5 - Begin analysis of TE Density data: 23 | #### Generate dotplots of average TE Density values for all genes as the window changes: 24 | Please reference the makefile command `generate_dotplots` and its python file `generate_dotplots.py`. Here we initialize the DensityData class to aid in accessing the data in the output HDF5 files. 25 | 26 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/README.md: -------------------------------------------------------------------------------- 1 | # TE Density Comparisons of Syntelogs in Rice: 2 | This README contains information pertaining to how we compared TE Density levels of syntelogs using two closely related rice genomes. Please refer to the `Makefile` for our explicit commands we used from start to finish in creating and using the data relevant to this example. 3 | 4 | # Downloading of Rice FASTA and Gene Annotation Files: 5 | 6 | # Generating a TE Annotation for Rice using EDTA: 7 | First, we need to generate a TE annotation for the two rice genomes. We do this by creating a CDS FASTA for each genome and then fixing (shortening) the names of the sequences in both the regular FASTA and CDS FASTA to run EDTA without any warnings (it does not like long sequence ID names). 8 | We use three main code blocks in the `Makefile` to get the necessary files prior to running EDTA: 9 | 1. `create_CDS` utilizes `gffread` to create a CDS FASTA file using the regular FASTA file and a gene annotation. 10 | 2. 
`fix_fasta_names` alters the sequence ID names in the FASTA file so that they aren't too long for EDTA. 11 | 3. `fix_CDS_names` alters the sequence ID names in the FASTA file so that they aren't too long for EDTA. 12 | 4. `run_EDTA_HPCC` executes the commands required to run EDTA on the HPCC (the computing cluster of MSU). Please refer to the `Annotate_EDTA_Rice_Glaberrima.sb` and `Annotate_EDTA_Rice_Sativa.sb` files to see the commands and resources we used in generating the EDTA annotations. 13 | 14 | # Perform the Pre-processing Steps of TE Density, Filter Both Annotation Files: 15 | Here please refer to `filter_genes` and `filter_TEs` to view the commands we invoked to pre-process our data. The `import_rice_gene_anno.py` and `import_rice_EDTA.py` are the primary scripts the user will need to edit for their own purposes. 16 | 1. Rice gene annotation... 17 | 18 | 19 | 20 | # Running SynMap: 21 | This section describes the methods to run [SynMap](https://genomevolution.org/CoGe/SynMap.pl) on CoGe. I ran SynMap with mostly [default options](https://genomevolution.org/wiki/index.php/SynMap), I did change one option: under *Merge Syntenic Blocks* I set it to `Quota Align Merge`. Here is the [link](https://genomevolution.org/r/1how2) for *Glaberrima vs Sativa*. 22 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/EDTA_Blueberry.out: -------------------------------------------------------------------------------- 1 | 2 | ######################################################## 3 | ##### Extensive de-novo TE Annotator (EDTA) v1.9.7 #### 4 | ##### Shujun Ou (shujun.ou.1@gmail.com) #### 5 | ######################################################## 6 | 7 | 8 | 9 | Fri Mar 19 14:21:25 EDT 2021 Dependency checking: 10 | All passed! 11 | 12 | Fri Mar 19 14:22:40 EDT 2021 The longest sequence ID in the genome contains 18 characters, which is longer than the limit (15) 13 | Trying to reformat seq IDs... 
14 | Attempt 1... 15 | Attempt 2... 16 | Fri Mar 19 14:23:25 EDT 2021 Seq ID conversion successful! 17 | 18 | A CDS file /mnt/research/edgerpat_lab/Scotty/Blueberry_Data/Genome/Vacc_c_CoGe_CDS.fasta is provided via --cds. Please make sure this is the DNA sequence of coding regions only. 19 | 20 | Fri Mar 19 14:24:05 EDT 2021 Obtain raw TE libraries using various structure-based programs: 21 | Fri Mar 19 14:24:24 EDT 2021 Obtain raw TE libraries finished. 22 | All intact TEs found by EDTA: 23 | Vaccinium_corymbosum.faa.mod.EDTA.intact.fa 24 | Vaccinium_corymbosum.faa.mod.EDTA.intact.gff3 25 | 26 | Fri Mar 19 14:24:24 EDT 2021 Perform EDTA advance filtering for raw TE candidates and generate the stage 1 library: 27 | 28 | Fri Mar 19 16:50:16 EDT 2021 EDTA advance filtering finished. 29 | 30 | Fri Mar 19 16:50:16 EDT 2021 Perform EDTA final steps to generate a non-redundant comprehensive TE library: 31 | 32 | Use RepeatModeler to identify any remaining TEs that are missed by structure-based methods. 33 | 34 | Mon Mar 22 05:38:12 EDT 2021 Clean up TE-related sequences in the CDS file with TEsorter: 35 | 36 | Remove CDS-related sequences in the EDTA library: 37 | 38 | Mon Mar 22 09:24:38 EDT 2021 EDTA final stage finished! You may check out: 39 | The final EDTA TE library: Vaccinium_corymbosum.faa.mod.EDTA.TElib.fa 40 | Mon Mar 22 09:24:38 EDT 2021 Perform post-EDTA analysis for whole-genome annotation: 41 | 42 | Mon Mar 22 17:03:06 EDT 2021 TE annotation using the EDTA library has finished! 
def import_filtered_TEs(tes_input_path, logger):
    """
    Import a pre-filtered TE file to the pipeline

    Args:
        tes_input_path (str): String to path of pre-filtered TE file

        logger (logging.Logger): object to log messages

    Returns:
        transposon_data (pandas.core.frame.DataFrame): A pandas dataframe
            representing the preprocessed transposon annotation file,
            sorted by chromosome then start position
    """
    # Column types expected in a correctly pre-filtered annotation
    expected_dtypes = {
        "Start": "float64",
        "Stop": "float64",
        "Length": "float64",
        "Chromosome": str,
        "Strand": str,
        "Order": str,
        "SuperFamily": str,
    }

    try:
        te_frame = pd.read_csv(
            tes_input_path,
            header="infer",
            sep="\t",
            dtype=expected_dtypes,
        )
    except Exception as err:
        msg = (
            "Error occurred while trying to read preprocessed TE "
            "annotation file into a Pandas dataframe, please refer "
            "to the README as to what information is expected"
        )
        logger.critical(msg)
        raise err

    # Check for missing data issues
    check_nulls(te_frame, logger)

    # Report out to user some quick data metrics
    logger.info(diagnostic_cleaner_helper(te_frame))

    # Sort for legibility
    te_frame.sort_values(by=["Chromosome", "Start"], inplace=True)

    logger.info("import of pre-filtered transposon annotation... success!")

    return te_frame


def diagnostic_cleaner_helper(TE_Data):
    """Return a human-readable summary of the filtered TE annotation."""
    chromosomes = TE_Data.Chromosome.unique()
    te_orders = TE_Data.Order.unique()
    superfamilies = TE_Data.SuperFamily.unique()
    info = f"""
    ---------------------------------
    Filtered TE Annotation Information:
    No. unique chromosomes: {len(chromosomes)}
    Unique chromosomes: {chromosomes}

    No. unique TE Orders: {len(te_orders)}
    Unique TE Orders: {te_orders}

    No. unique TE superfamilies: {len(superfamilies)}
    Unique TE superfamilies: {superfamilies}
    ---------------------------------
    """
    return info
def graph_barplot_density_differences(
    values,
    te_type,
    window_val,
    direction,
    number_of_zeros,
    output_dir,
    logger,
    display=False,
    align="left",
):
    """
    Plot a histogram of TE density differences between syntelog pairs

    Args:
        values (list): A list of values representing the TE density differences
            between syntelog pairs

        te_type (str): String representing the TE type being plotted

        window_val (int): Integer representing the current window of which the
            data is being plotted

        direction (str): string representing whether or not the graphs are
            coming from upstream or downstream TE density data

        number_of_zeros (int): count of syntelog pairs whose density
            difference was exactly zero; shown in the legend only

        output_dir (str): directory in which the output PNG is saved; the
            filename is built from te_type, window_val and direction

        logger (logging.Logger): Object to log information to

        display (boolean): Defaults to False, if True shows the plot upon
            generation with the plt.show() command

        align (str): Defaults to "left". NOTE(review): currently unused by
            the function body; kept for interface compatibility
    """

    # MAGIC, the bins to group density values for the histogram AND the values
    # for the xticks on the xaxis
    tick_bins = [
        -1.0,
        -0.9,
        -0.8,
        -0.7,
        -0.6,
        -0.5,
        -0.4,
        -0.3,
        -0.2,
        -0.1,
        0,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
        0.6,
        0.7,
        0.8,
        0.9,
        1.0,
    ]

    plt.figure(figsize=(8, 6))
    n, bins, patches = plt.hist(
        values, bins=tick_bins, facecolor="blue", ec="black", alpha=0.5
    )
    plt.rcParams["xtick.labelsize"] = 7  # MAGIC set size of axis ticks
    plt.ylabel("Number of Genes")
    plt.xlabel("Difference in TE Density Values")
    plt.title("O. glaberrima vs O. sativa")  # MAGIC genome name order here
    # Legend carries the run metadata rather than a separate caption
    N = mpatches.Patch(
        label="Total Plotted Genes: %s \nTE type: %s \nWindow: %s \nDirection: %s \nNo. 0 Differences: %s"
        % (len(values), te_type, window_val, direction, str(number_of_zeros))
    )
    plt.xticks(tick_bins)
    plt.legend(handles=[N])
    path = os.path.join(
        output_dir,
        (te_type + "_" + str(window_val) + "_" + direction + "_DensityDifferences.png"),
    )
    logger.info("Saving graph to: %s" % path)
    plt.savefig(path)
    if display:
        plt.show()
    plt.close()
$(ROOT_DIR)/src/Annotate_EDTA_Arabidopsis_thaliana.sb 28 | 29 | filter_genes: 30 | @echo Filtering Arabidopsis genes into appropriate format for TE Density 31 | python $(ROOT_DIR)/src/import_Arabidopsis_gene_anno.py $(DEV_ARAB_GENES) 32 | 33 | filter_TEs: 34 | @echo Filtering Arabidopsis TEs into appropriate format for TE Density 35 | python $(ROOT_DIR)/src/import_Arabidopsis_EDTA.py $(DEV_ARAB_TEs) 36 | 37 | run_TE_Density_HPCC: 38 | @echo Running TE Density for Glaberrima 39 | @echo sbatch file contains paths inside 40 | sbatch $(ROOT_DIR)/src/TE_Density_Arabidopsis.sb 41 | 42 | generate_dotplots: 43 | @echo Generating TE density dotplot for Arabidopsis 44 | mkdir -p $(DEV_RESULTS)/graphs 45 | python $(ROOT_DIR)/src/generate_dotplots.py $(DEV_HDF5)/Arabidopsis_Chr1.h5 $(DEV_FILTERED)/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv 1 -o $(DEV_RESULTS)/graphs 46 | 47 | compare_density_upstream_downstream: 48 | @echo Comparing upstream and downstream density arrays with chi-squared 49 | mkdir -p $(DEV_RESULTS)/chi_squared 50 | python $(ROOT_DIR)/src/chi_squared_density_comparison.py $(DEV_FILTERED)/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv $(DEV_HDF5) -o $(DEV_RESULTS)/chi_squared 51 | 52 | compare_centromeric: 53 | @echo Comparing centromeric/pericentromeric and regular density arrays 54 | mkdir -p $(DEV_RESULTS)/centromeric 55 | python $(ROOT_DIR)/src/compare_centromeric_densities.py $(DEV_FILTERED)/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv $(DEV_HDF5) $(DEV_ARAB_EXP) -o $(DEV_RESULTS)/centromeric 56 | -------------------------------------------------------------------------------- /transposon/worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Generic consumer / producer with Process. 5 | 6 | Inherit and implement `Worker.execute_job`. 7 | Implement your 'job' container, enqueue to the job queue. 8 | Consumes jobs from the job queue, passes them to `execute_job`. 
class WorkerProcess(Process, ABC):
    """Consumes jobs, produces results.

    Subclasses implement `execute_job`.  Jobs are pulled from the input
    queue; results are pushed to the output queue.  Processing ends when
    the stop event is set or a Sentinel is received on the input queue.
    """

    TIMEOUT = 0.2  # MAGIC arbitrary, affects time to shutdown if not using sentinel

    def __init__(self, job_queue, result_queue, stop_event):
        """Initializer.

        Args:
            job_queue(Queue): input queue
            result_queue(Queue): output queue
            stop_event(Event): end if set
        """

        super().__init__()
        self._logger = logging.getLogger(self.__class__.__name__)
        self.input = job_queue
        self.output = result_queue
        self.stop_event = stop_event

    @abstractmethod
    def execute_job(self, job):
        """Target function for the worker.

        Args:
            job (tuple): caller's container pulled from the job queue
        Returns:
            tuple: caller's container to be enqueued to the result queue
        """

        pass

    def run(self):
        """Get jobs, put results.

        Returns if the stop event is set or the sentinel is received.
        """

        job = None
        result = None
        while not self.stop_event.is_set():

            # Do not pull a new job while a result is still pending; retry
            # sending it on the next pass instead.
            if not self._send_result(result):
                continue
            else:
                result = None

            try:
                job = job or self.input.get(timeout=self.TIMEOUT)
            except queue.Empty:
                continue
            except KeyboardInterrupt:
                break
            else:
                # Re-enqueue the sentinel so sibling workers also shut down.
                if isinstance(job, Sentinel):
                    self.input.put_nowait(job)
                    break

                result = self.execute_job(job)
                job = None

    def _send_result(self, result):
        """Try to enqueue a result.

        Args:
            result: container produced by `execute_job`, or None
        Returns:
            bool: True if 'result' is None or was enqueued; False if the
                output queue was full and the caller should retry.
        """

        if result is None:
            return True

        success = False
        try:
            self.output.put(result, timeout=self.TIMEOUT)
        except queue.Full:
            self._logger.warning("output queue is full!")
        else:
            # BUGFIX: was `sucess = True` (typo) and the method returned
            # True unconditionally, so results were silently dropped when
            # the output queue was full and `run` never retried them.
            success = True

        return success
7434.0 14 | augustus_masked-Fvb1-1-processed-gene-3.8 Fvb1-1 gene 374758.0 376307.0 - 1550.0 15 | maker-Fvb1-1-augustus-gene-3.18 Fvb1-1 gene 386791.0 388690.0 + 1900.0 16 | maker-Fvb1-1-augustus-gene-4.22 Fvb1-1 gene 412453.0 424954.0 - 12502.0 17 | maker-Fvb1-1-augustus-gene-4.23 Fvb1-1 gene 426581.0 435854.0 - 9274.0 18 | snap_masked-Fvb1-1-processed-gene-4.8 Fvb1-1 gene 437432.0 440233.0 + 2802.0 19 | snap_masked-Fvb1-1-processed-gene-4.10 Fvb1-1 gene 441720.0 441908.0 + 189.0 20 | augustus_masked-Fvb1-1-processed-gene-4.1 Fvb1-1 gene 454008.0 454529.0 + 522.0 21 | maker-Fvb1-1-snap-gene-4.25 Fvb1-1 gene 454566.0 457877.0 + 3312.0 22 | augustus_masked-Fvb1-1-processed-gene-4.3 Fvb1-1 gene 457884.0 462352.0 + 4469.0 23 | maker-Fvb1-1-snap-gene-4.29 Fvb1-1 gene 462652.0 465952.0 - 3301.0 24 | maker-Fvb1-1-augustus-gene-5.10 Fvb1-1 gene 523960.0 528517.0 + 4558.0 25 | maker-Fvb1-1-augustus-gene-5.11 Fvb1-1 gene 558851.0 561043.0 - 2193.0 26 | snap_masked-Fvb1-1-processed-gene-5.7 Fvb1-1 gene 568624.0 574304.0 - 5681.0 27 | snap_masked-Fvb1-1-processed-gene-6.4 Fvb1-1 gene 598901.0 599104.0 + 204.0 28 | maker-Fvb1-1-augustus-gene-6.12 Fvb1-1 gene 599215.0 600680.0 + 1466.0 29 | maker-Fvb1-1-augustus-gene-6.15 Fvb1-1 gene 600967.0 602849.0 - 1883.0 30 | snap_masked-Fvb1-1-processed-gene-6.11 Fvb1-1 gene 618100.0 622575.0 - 4476.0 31 | maker-Fvb1-1-snap-gene-6.19 Fvb1-1 gene 641786.0 649783.0 + 7998.0 32 | maker-Fvb1-1-augustus-gene-6.14 Fvb1-1 gene 660932.0 662110.0 + 1179.0 33 | maker-Fvb1-1-snap-gene-7.16 Fvb1-1 gene 690039.0 694198.0 + 4160.0 34 | maker-Fvb1-1-snap-gene-7.17 Fvb1-1 gene 706484.0 718558.0 + 12075.0 35 | snap_masked-Fvb1-1-processed-gene-7.12 Fvb1-1 gene 719899.0 720210.0 - 312.0 36 | maker-Fvb1-1-augustus-gene-7.15 Fvb1-1 gene 723171.0 724243.0 + 1073.0 37 | snap_masked-Fvb1-1-processed-gene-8.8 Fvb1-1 gene 797822.0 799572.0 + 1751.0 38 | maker-Fvb1-1-augustus-gene-8.23 Fvb1-1 gene 799576.0 801525.0 - 1950.0 39 | 
snap_masked-Fvb1-1-processed-gene-8.11 Fvb1-1 gene 804213.0 804524.0 + 312.0 40 | snap_masked-Fvb1-1-processed-gene-8.16 Fvb1-1 gene 854341.0 858564.0 - 4224.0 41 | snap_masked-Fvb1-1-processed-gene-8.17 Fvb1-1 gene 858574.0 858837.0 - 264.0 42 | -------------------------------------------------------------------------------- /transposon/notes: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | DESIGN 4 | 5 | - refactor merge data to do multiple processes per chromosome 6 | - so it doesn't suck 7 | - allow rewrite of files (open and close) 8 | - allow bitmap to track completed jobs (so I know which jobs need to be done) 9 | - so density calculators don't suck 10 | """ 11 | 12 | for each chromosome 13 | process = new_process(chromosome) # BAD 14 | for each gene in process; 15 | for each entry in gene / TE / window ... 16 | calculate 17 | insert 18 | 19 | 20 | for each chromosome 21 | workers = make_workers_for_a_chromosome(chrom) 22 | 23 | # split 24 | # mega job is composed of little jobs 25 | jobs = make_jobs(gene_subset, te_subset, window_subset) # pseudo-split 26 | 27 | # merge 28 | # for each result from the jobs, combine in some way 29 | merge_worker = make_merger() # 30 | better_density_data = new_density_data() 31 | with merge_worker as my_merger: 32 | result = get_result() 33 | better_density_data.insert_result(result) 34 | 35 | 36 | do_merge 37 | receive result 38 | 39 | insert result 40 | 41 | transmit success/failure 42 | 43 | do_work 44 | receive job 45 | 46 | create_subset of density 47 | loop 48 | 49 | transmit job 50 | 51 | 52 | # USER STORIES 53 | 54 | ## really basic access 55 | 56 | I just need density for one thing....(window/direction/te-type) 57 | 58 | can we add a column to the gene data panda frame, 59 | that column would be a specify TE density value for window/direction/te-type 60 | we aren't limited to adding to a gene data frame 61 | 62 | give me TE density values for all genes (2Darray?), 
e.g.: 63 | 64 | for all genes 65 | for one TE type 66 | for one window 67 | for one direction 68 | 69 | 70 | 71 | class NewHotness: 72 | 73 | def __init__(self): 74 | pass 75 | 76 | 77 | def fixture_density(): 78 | 79 | 80 | def test_muhgenes(): 81 | """ """ 82 | 83 | density = NewHotness() 84 | density.muhgenes(args...) # what are these args? 85 | 86 | 87 | 88 | @dataclass 89 | class DensityAccessor(): 90 | genes 91 | density 92 | te_order (Order|SuperFamily) 93 | te_name (LTR...) 94 | direction (left|intra|right) 95 | window (100 000 000...) 96 | 97 | def muhdensity(): 98 | """Convenience function to get some values.""" 99 | return pandaframe() 100 | 101 | def muhgenes(): 102 | """Generator to produce the densities across the genes""" 103 | for gene in allthegenes: 104 | self.gene = gene 105 | yield self.mudensity() 106 | 107 | 108 | ## what the heck are all these chromosome files, I just want my genome! 109 | 110 | having all of the files split up by chromosome may make logical sense 111 | from a density calculation standpoint b/c each on is indepenent 112 | BUT it's annoying when making analysis 113 | 114 | can we make one H5 file to rule them all? 115 | yes 116 | maybe just have one layer of abstraction, index each chromosome 117 | maybe concatenate the data? 
need to maintain what chromosome it came from 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /tests/test_transposon.tsv: -------------------------------------------------------------------------------- 1 | Fvb1-1 RepeatMasker DNA/Mutator 3350 4133 5.0 + 4992 Fxa_V1_4_26930 2 | Fvb1-1 RepeatMasker LTR/Gypsy 4209 4873 11.7 - 2969 Fvb4-4:6485593..6486917_LTR 3 | Fvb1-1 RepeatMasker LTR/Gypsy 4871 5232 16.0 - 1185 Fvb4-4:6485593..6486917_LTR 4 | Fvb1-1 RepeatMasker LTR/unknown 15404 16346 19.1 - 3262 Fvb1-1:9463282..9465926_LTR 5 | Fvb1-1 RepeatMasker LTR/unknown 16347 20224 3.8 - 30004 Fvb3-2:23517681..23521581_INT-int 6 | Fvb1-1 RepeatMasker LTR/unknown 20225 22848 15.4 - 12678 Fvb1-1:9463282..9465926_LTR 7 | Fvb1-1 RepeatMasker LTR/Gypsy 23450 24094 20.3 - 2636 Fvb1-1:5519794..5522056_LTR 8 | Fvb1-1 RepeatMasker LTR/unknown 24133 24415 19.0 - 1082 Fvb1-3:25198782..25200749_LTR 9 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 24412 24910 8.5 + 2257 family-72 10 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 25024 25752 18.8 + 1671 family-72 11 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 25751 25840 21.4 + 351 family-72 12 | Fvb1-1 RepeatMasker LTR/unknown 25842 27393 5.9 + 10968 Fvb6-4:17811941..17814568_LTR 13 | Fvb1-1 RepeatMasker LTR/Gypsy 27398 27999 19.6 + 1680 Fvb4-2:14377692..14383914_LTR 14 | Fvb1-1 RepeatMasker LTR/Gypsy 28097 28708 21.1 + 1651 Fvb4-2:14377692..14383914_LTR 15 | Fvb1-1 RepeatMasker LTR/unknown 28709 29048 4.4 - 2681 Fvb6-3:14035592..14036822_LTR 16 | Fvb1-1 RepeatMasker LTR/unknown 29051 30034 5.9 - 7242 Fvb7-3:21810512..21811992_LTR 17 | Fvb1-1 RepeatMasker LTR/unknown 30038 30447 20.0 + 1455 Fvb1-2:27872851..27875385_LTR 18 | Fvb1-1 RepeatMasker LTR/Gypsy 30440 31593 15.4 - 5517 Fvb1-1:1901529..1903797_LTR 19 | Fvb1-1 RepeatMasker LTR/Gypsy 30986 31928 12.8 + 3981 Fvb1-2:23736536..23738711_LTR 20 | Fvb1-1 RepeatMasker LTR/Gypsy 32011 33581 7.7 + 11550 Fvb6-4:12909946..12918059_INT-int 21 | Fvb1-1 
def import_genes(genes_input_path, logger):
    """
    Read a GFF-style gene annotation file into a pandas frame.

    Args:
        genes_input_path (str): path to the raw gene annotation file

        logger (logging.Logger): object to log messages. NOTE(review):
            currently unused here; kept for interface parity with the
            other import helpers

    Returns:
        pandas.core.frame.DataFrame: gene rows indexed by gene name, with
            Chromosome, Feature, Start, Stop, Strand and Length columns,
            restricted to scaffolds VaccDscaff0 through VaccDscaff48
    """
    raw_columns = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Frame",
        "FullName",
    ]

    wanted_columns = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Strand",
        "FullName",
    ]

    gene_frame = pd.read_csv(
        genes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=raw_columns,
        usecols=wanted_columns,
        dtype={"Stop": "float64", "Start": "float64"},
        comment="#",
    )

    # Keep only the 'gene' feature rows of the annotation
    gene_frame = gene_frame[gene_frame.Feature == "gene"]

    # Pull the gene name out of the attribute string and index rows by it
    gene_frame["Gene_Name"] = gene_frame["FullName"].str.extract(r"ID=(.*?);")
    gene_frame.set_index("Gene_Name", inplace=True)
    gene_frame = gene_frame.drop(columns=["FullName", "Software"])

    gene_frame.Strand = gene_frame.Strand.astype(str)
    gene_frame["Length"] = gene_frame.Stop - gene_frame.Start + 1

    gene_frame.sort_values(by=["Chromosome", "Start"], inplace=True)

    # MAGIC keep only scaffolds VaccDscaff0 through VaccDscaff48
    wanted_scaffolds = [f"VaccDscaff{i}" for i in range(49)]
    return gene_frame.loc[gene_frame["Chromosome"].isin(wanted_scaffolds)]
DEBUG" 96 | ) 97 | 98 | args = parser.parse_args() 99 | args.gene_input_file = os.path.abspath(args.gene_input_file) 100 | args.output_dir = os.path.abspath(args.output_dir) 101 | 102 | log_level = logging.DEBUG if args.verbose else logging.INFO 103 | logger = logging.getLogger(__name__) 104 | coloredlogs.install(level=log_level) 105 | 106 | # Execute 107 | cleaned_genes = import_genes(args.gene_input_file, logger) 108 | write_cleaned_genes(cleaned_genes, args.output_dir, "Blueberry", logger) 109 | -------------------------------------------------------------------------------- /examples/Human/src/import_human_te_anno.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Reformat the human TE file into a format conducive to the Transposon_Data class 5 | """ 6 | 7 | __author__ = "Scott Teresi" 8 | 9 | import pandas as pd 10 | import argparse 11 | import logging 12 | import os 13 | import coloredlogs 14 | 15 | from examples.Human.src.replace_human_TE_names import te_annot_renamer 16 | 17 | 18 | def write_cleaned_TEs(te_pandaframe, output_dir, genome_name, logger): 19 | file_name = os.path.join( 20 | output_dir, ("Cleaned_Chr7_13_" + genome_name + "_TEs.tsv") 21 | ) 22 | 23 | logger.info("Writing cleaned TE file to: %s" % file_name) 24 | te_pandaframe.to_csv(file_name, sep="\t", header=True, index=False) 25 | 26 | 27 | def import_human_tes(human_TE_file, logger): 28 | """ 29 | We want a pandas object with Chromosome, Start, Stop, Strand, Order, SuperFamily and Length as columns 30 | """ 31 | data = pd.read_csv( 32 | human_TE_file, 33 | header="infer", 34 | sep="\t", 35 | dtype={ 36 | "genoStart": "float64", 37 | "genoEnd": "float64", 38 | "strand": str, 39 | "genoName": str, 40 | "repClass": str, 41 | "repFamily": str, 42 | }, 43 | ) 44 | data.drop( 45 | columns=[ 46 | "#bin", 47 | "swScore", 48 | "milliDiv", 49 | "milliIns", 50 | "milliDel", 51 | "id", 52 | "genoLeft", 53 | "repStart", 54 | 
"repEnd", 55 | "repLeft", 56 | "repName", 57 | ], 58 | inplace=True, 59 | ) 60 | data.rename( 61 | columns={ 62 | "genoName": "Chromosome", 63 | "repClass": "Order", 64 | "repFamily": "SuperFamily", 65 | "strand": "Strand", 66 | "genoStart": "Start", 67 | "genoEnd": "Stop", 68 | }, 69 | inplace=True, 70 | ) 71 | 72 | data["Length"] = data.Stop - data.Start + 1 73 | # NOTE only grabbing specific chromosomes 74 | chromosomes_i_want = ["chr7", "chr13"] # MAGIC 75 | data = data.loc[data["Chromosome"].isin(chromosomes_i_want)] 76 | 77 | data = te_annot_renamer(data) 78 | 79 | return data 80 | 81 | 82 | if __name__ == "__main__": 83 | """Command line interface to calculate density.""" 84 | 85 | parser = argparse.ArgumentParser(description="Reformat TE annotation file") 86 | path_main = os.path.abspath(__file__) 87 | dir_main = os.path.dirname(path_main) 88 | output_default = os.path.join( 89 | dir_main, "../../../../", "TE_Data/filtered_input_data" 90 | ) 91 | parser.add_argument( 92 | "TE_input_file", type=str, help="Parent path of TE annotation file" 93 | ) 94 | 95 | parser.add_argument( 96 | "--output_dir", 97 | "-o", 98 | type=str, 99 | default=output_default, 100 | help="Parent directory to output results", 101 | ) 102 | 103 | parser.add_argument( 104 | "-v", "--verbose", action="store_true", help="set debugging level to DEBUG" 105 | ) 106 | 107 | args = parser.parse_args() 108 | args.TE_input_file = os.path.abspath(args.TE_input_file) 109 | args.output_dir = os.path.abspath(args.output_dir) 110 | 111 | log_level = logging.DEBUG if args.verbose else logging.INFO 112 | logger = logging.getLogger(__name__) 113 | coloredlogs.install(level=log_level) 114 | 115 | cleaned_tes = import_human_tes(args.TE_input_file, logger) 116 | write_cleaned_TEs(cleaned_tes, args.output_dir, "Human", logger) 117 | -------------------------------------------------------------------------------- /tests/unit/test_import_genes.py: 
#!/usr/bin/env python3

"""
Unit test the gene annotation import logic: parsing genomic coordinates as
float32 silently loses precision on large values while float64 does not.
"""

__author__ = "Scott Teresi"

import os
import pytest
import numpy as np
import pandas as pd
from io import StringIO


col_names = [
    "Chromosome",
    "Software",
    "Feature",
    "Start",
    "Stop",
    "Score",
    "Strand",
    "Frame",
    "FullName",
]

col_to_use = [
    "Chromosome",
    "Software",
    "Feature",
    "Start",
    "Stop",
    "Strand",
    "FullName",
]

gene_anno_path = "tests/input_data/Test_Gene_Anno_Float_Conversion.tsv"


def _import_gene_anno(float_dtype, annotation_path=gene_anno_path):
    """Read a gene annotation with Start/Stop parsed as the given dtype.

    Shared implementation for the float32/float64 fixtures below, which
    previously duplicated this code verbatim. The annotation source is a
    parameter so callers can supply any path or file-like object.

    Args:
        float_dtype (str): numpy dtype name for the Start/Stop columns.
        annotation_path (str or file-like): annotation source to read.

    Returns:
        pandas.DataFrame: gene rows indexed by the parsed gene name.
    """
    gene_anno = pd.read_csv(
        annotation_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        dtype={"Start": float_dtype, "Stop": float_dtype},
        comment="#",
    )
    gene_anno = gene_anno[gene_anno.Feature == "gene"]  # drop non-gene rows
    # clean the names and set as the index (get row wrt name c.f. idx)
    gene_anno["Gene_Name"] = gene_anno["FullName"].str.extract(r"ID=(.*?);")
    gene_anno.set_index("Gene_Name", inplace=True)
    gene_anno = gene_anno.drop(columns=["FullName", "Software"])
    gene_anno.Strand = gene_anno.Strand.astype(str)
    gene_anno["Length"] = gene_anno.Stop - gene_anno.Start + 1
    return gene_anno


@pytest.fixture
def import_as_float32():
    """Test annotation with coordinates parsed as float32."""
    return _import_gene_anno("float32")


@pytest.fixture
def import_as_float64():
    """Test annotation with coordinates parsed as float64."""
    return _import_gene_anno("float64")


true_start_list = [
    41.0,
    5556.0,
    8487.0,
    9361.0,
    11127.0,
    84598.0,
    117287120.0,
    118974314397.0,
    22456307315831.0,
    88877765432319026.0,
]
# NOTE(review): unused placeholder (and not a list despite the name); kept
# so the module namespace is unchanged -- TODO remove or populate.
true_stop_list = 2


def test_small_numbers_to_float32(import_as_float32):
    """
    Using float32, does the import produce the right Start values when the
    annotation's coordinates are small?
    (BUGFIX: docstring previously said float64.)
    """
    assert import_as_float32.Start.to_list()[0:5] == true_start_list[0:5]


def test_large_numbers_to_float32(import_as_float32):
    """
    Using float32, large coordinates exceed the 24-bit mantissa, so the
    comparison is expected to fail.
    (BUGFIX: docstring previously said float64.)
    """
    with pytest.raises(AssertionError):
        assert import_as_float32.Start.to_list()[4:] == true_start_list[4:]


def test_large_numbers_to_float64(import_as_float64):
    """
    Using float64, does the import produce the right Start values when the
    annotation's coordinates are large?
    """
    assert import_as_float64.Start.to_list()[4:] == true_start_list[4:]


if __name__ == "__main__":
    pytest.main(["-svv", __file__])  # for convenience
#!/usr/bin/env python3

"""
Reformat CDS Fasta files for EDTA usage
"""

__author__ = "Scott Teresi"

import argparse
import os
import logging
import coloredlogs
from Bio import SeqIO


def reformat_seq_iq(input_fasta, genome_name, output_dir, logger):
    """
    Reformat a CDS FASTA file to have shorter sequence ID names for EDTA

    Args:
        input_fasta (str): String path to input fasta file

        genome_name (str): String for genome name

        output_dir (str): Path to output dir

        logger (logging.Logger): Object to log information to

    Returns:
        None, just saves the edited FASTA file to disk. Also writes a
        conversion table to disk for the old names and their new name
        counterparts
    """
    # MAGIC file suffixes
    new_fasta = os.path.join(output_dir, (genome_name + "_CDS_NewNames.fasta"))
    name_key = os.path.join(output_dir, (genome_name + "_CDS_Seq_ID_Conversion.txt"))

    if os.path.exists(new_fasta):
        os.remove(new_fasta)  # remove stale output; the fasta is opened in
        # append mode below
    if os.path.exists(name_key):
        os.remove(name_key)
    pair_dict = {}  # NB this is used to write the conversion key later for
    # clarity
    # BUGFIX: the output handle is opened once for the whole run; the
    # original re-opened the file in append mode for every record, costing
    # one open/close pair per sequence. The input handle also no longer
    # shadows the `input_fasta` argument.
    with open(input_fasta, "r") as in_handle, open(new_fasta, "a") as out_handle:
        for s_record in SeqIO.parse(in_handle, "fasta"):
            # NB the s_record.id and s_record.description combined contain
            # all the information for each entry following the '>' character
            # in the fasta

            # NB In this case:
            # We just want the s_record.id which correctly points to the
            # first integer. E.g '1 dna:chromosome blah blah blah' we just want
            # 1.
            s_record.id = s_record.id.replace("transcript:", "")
            pair_dict[s_record.id] = s_record.id + " " + s_record.description
            s_record.description = ""  # NB edit the description so that when
            # we rewrite we don't have the extraneous info
            SeqIO.write(s_record, out_handle, "fasta")
    logger.info(
        "Finished writing new fasta to: %s" % os.path.join(output_dir, new_fasta)
    )

    with open(name_key, "w") as output:
        for key, val in pair_dict.items():
            # Write the conversion table for record-keeping.
            output.write("%s\t%s\n" % (key, val))
    logger.info(
        "Finished writing name conversion table to: %s"
        % os.path.join(output_dir, name_key)
    )


if __name__ == "__main__":

    path_main = os.path.abspath(__file__)
    dir_main = os.path.dirname(path_main)
    parser = argparse.ArgumentParser(description="Reformat FASTA for EDTA")

    parser.add_argument("fasta_input_file", type=str, help="parent path of fasta file")
    parser.add_argument("genome_id", type=str, help="name of genome")
    output_default = os.path.join(
        dir_main, "../../../../", "TE_Density_Example_Data/Rice"
    )
    parser.add_argument(
        "--output_dir",
        "-o",
        type=str,
        default=output_default,
        help="Parent directory to output results",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="set debugging level to DEBUG"
    )
    args = parser.parse_args()
    args.fasta_input_file = os.path.abspath(args.fasta_input_file)
    args.output_dir = os.path.join(args.output_dir, args.genome_id, "Sequences")

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = logging.getLogger(__name__)
    coloredlogs.install(level=log_level)

    reformat_seq_iq(args.fasta_input_file, args.genome_id, args.output_dir, logger)
"Ngaro", 20 | "RYV": "VIPER", 21 | "RPP": "Penelope", 22 | "RIR": "R2", 23 | "RIT": "RTE", 24 | "RIJ": "Jockey", 25 | "RIL": "L1", 26 | "RII": "I", 27 | "RST": "tRNA", 28 | "RSL": "7SL", 29 | "RSS": "5S", 30 | "DTT": "Tc1-Mariner", 31 | "DTA": "hAT", 32 | "DTM": "Mutator", 33 | "DTE": "Merlin", 34 | "DTR": "Transib", 35 | "DTP": "P", 36 | "DTB": "PiggyBac", 37 | "DTH": "PIF-Harbinger", 38 | "DTC": "CACTA", 39 | "DYC": "Crypton", 40 | "DHH": "Helitron", 41 | "DMM": "Maverick", 42 | # Custom changes 43 | "unknown": U, 44 | "Unknown": U, 45 | "None": U, 46 | "EnSpm_CACTA": "CACTA", 47 | "MuDR_Mutator": "Mutator", 48 | "PIF_Harbinger": "PIF-Harbinger", 49 | } 50 | 51 | TE_Data.SuperFamily.fillna( 52 | value="Unknown_Superfam", inplace=True 53 | ) # replace None w U 54 | 55 | # Invoke dictionary to fix names 56 | TE_Data.Order.replace(master_order, inplace=True) 57 | TE_Data.SuperFamily.replace(master_superfamily, inplace=True) 58 | 59 | # Rename the superfamily value for pararetros as pararetrovirus 60 | TE_Data.loc[TE_Data.Order == "pararetrovirus", "SuperFamily"] = "pararetrovirus" 61 | 62 | # Rename unknown LINE element superfamilies to Unknown_LINE_Superfam to 63 | # distinguish between other unknowns 64 | TE_Data.loc[ 65 | (TE_Data.Order == "LINE") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 66 | "SuperFamily", 67 | ] = "Unknown_LINE_Superfam" 68 | 69 | # Rename unknown LTR element superfamilies to Unknown_LTR_Superfam to 70 | # distinguish between other unknowns 71 | TE_Data.loc[ 72 | (TE_Data["Order"] == "LTR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 73 | "SuperFamily", 74 | ] = "Unknown_LTR_Superfam" 75 | 76 | # Rename unknown TIR element superfamilies to Unknown_TIR_Superfam to 77 | # distinguish between other unknowns 78 | TE_Data.loc[ 79 | (TE_Data.Order == "TIR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 80 | "SuperFamily", 81 | ] = "Unknown_TIR_Superfam" 82 | 83 | # Rename both values for Helitron elements, so that 'Helitron' is 84 
| # both the Order and SuperFamily value 85 | # Some Helitron elements were labeled 'DNA' in the Order location, this is 86 | # technically correct but I prefer to differentiate the TIR DNA elements 87 | # from DNA elements as a whole 88 | TE_Data.loc[ 89 | (TE_Data["Order"] == "TIR") & (TE_Data["SuperFamily"] == "Helitron"), 90 | ["Order", "SuperFamily"], 91 | ] = "Helitron" 92 | # If the Order is Helitron and the SuperFamily is unknown make the 93 | # superfamily 'Helitron' 94 | TE_Data.loc[ 95 | (TE_Data["Order"] == "Helitron") 96 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 97 | "SuperFamily", 98 | ] = "Helitron" 99 | 100 | # For TEs that are unknown for both Order AND SuperFamily we will call 101 | # those 'Completely_Unknown' 102 | TE_Data.loc[ 103 | (TE_Data["Order"] == "Unknown_Order") 104 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 105 | ["Order", "SuperFamily"], 106 | ] = "Completely_Unknown" 107 | 108 | return TE_Data 109 | 110 | 111 | def diagnostic_cleaner_helper(TE_Data): 112 | print() 113 | print(TE_Data.Order.unique()) 114 | print(TE_Data.SuperFamily.unique()) 115 | print() 116 | 117 | # To see unique for a given type: 118 | # print(TE_Data.loc[TE_Data['Order'] == 'LINE'].SuperFamily.unique()) 119 | return None 120 | -------------------------------------------------------------------------------- /examples/Arabidopsis/src/replace_names_Arabidopsis.py: -------------------------------------------------------------------------------- 1 | def te_annot_renamer(TE_Data): 2 | U = "Unknown_Order" 3 | master_order = { 4 | "Unknown": U, 5 | "MITE": "TIR", 6 | "pararetrovirus": "pararetrovirus", 7 | "DNA": "TIR", 8 | } 9 | 10 | U = "Unknown_Superfam" 11 | master_superfamily = { 12 | # EDTA/Wicker et al 2007 renames to common name: 13 | "RLC": "Copia", 14 | "RLG": "Gypsy", 15 | "RLB": "Bel_Pao", 16 | "RLR": "Retrovirus", 17 | "RLE": "ERV", 18 | "RYD": "DIRS", 19 | "RYN": "Ngaro", 20 | "RYV": "VIPER", 21 | "RPP": "Penelope", 22 | "RIR": "R2", 23 | 
"RIT": "RTE", 24 | "RIJ": "Jockey", 25 | "RIL": "L1", 26 | "RII": "I", 27 | "RST": "tRNA", 28 | "RSL": "7SL", 29 | "RSS": "5S", 30 | "DTT": "Tc1-Mariner", 31 | "DTA": "hAT", 32 | "DTM": "Mutator", 33 | "DTE": "Merlin", 34 | "DTR": "Transib", 35 | "DTP": "P", 36 | "DTB": "PiggyBac", 37 | "DTH": "PIF-Harbinger", 38 | "DTC": "CACTA", 39 | "DYC": "Crypton", 40 | "DHH": "Helitron", 41 | "DMM": "Maverick", 42 | # Custom changes 43 | "unknown": U, 44 | "Unknown": U, 45 | "None": U, 46 | "EnSpm_CACTA": "CACTA", 47 | "MuDR_Mutator": "Mutator", 48 | "PIF_Harbinger": "PIF-Harbinger", 49 | } 50 | 51 | TE_Data.SuperFamily.fillna( 52 | value="Unknown_Superfam", inplace=True 53 | ) # replace None w U 54 | 55 | # Invoke dictionary to fix names 56 | TE_Data.Order.replace(master_order, inplace=True) 57 | TE_Data.SuperFamily.replace(master_superfamily, inplace=True) 58 | 59 | # Rename the superfamily value for pararetros as pararetrovirus 60 | TE_Data.loc[TE_Data.Order == "pararetrovirus", "SuperFamily"] = "pararetrovirus" 61 | 62 | # Rename unknown LINE element superfamilies to Unknown_LINE_Superfam to 63 | # distinguish between other unknowns 64 | TE_Data.loc[ 65 | (TE_Data.Order == "LINE") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 66 | "SuperFamily", 67 | ] = "Unknown_LINE_Superfam" 68 | 69 | # Rename unknown LTR element superfamilies to Unknown_LTR_Superfam to 70 | # distinguish between other unknowns 71 | TE_Data.loc[ 72 | (TE_Data["Order"] == "LTR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 73 | "SuperFamily", 74 | ] = "Unknown_LTR_Superfam" 75 | 76 | # Rename unknown TIR element superfamilies to Unknown_TIR_Superfam to 77 | # distinguish between other unknowns 78 | TE_Data.loc[ 79 | (TE_Data.Order == "TIR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 80 | "SuperFamily", 81 | ] = "Unknown_TIR_Superfam" 82 | 83 | # Rename both values for Helitron elements, so that 'Helitron' is 84 | # both the Order and SuperFamily value 85 | # Some Helitron elements were 
labeled 'DNA' in the Order location, this is 86 | # technically correct but I prefer to differentiate the TIR DNA elements 87 | # from DNA elements as a whole 88 | TE_Data.loc[ 89 | (TE_Data["Order"] == "TIR") & (TE_Data["SuperFamily"] == "Helitron"), 90 | ["Order", "SuperFamily"], 91 | ] = "Helitron" 92 | # If the Order is Helitron and the SuperFamily is unknown make the 93 | # superfamily 'Helitron' 94 | TE_Data.loc[ 95 | (TE_Data["Order"] == "Helitron") 96 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 97 | "SuperFamily", 98 | ] = "Helitron" 99 | 100 | # For TEs that are unknown for both Order AND SuperFamily we will call 101 | # those 'Completely_Unknown' 102 | TE_Data.loc[ 103 | (TE_Data["Order"] == "Unknown_Order") 104 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 105 | ["Order", "SuperFamily"], 106 | ] = "Completely_Unknown" 107 | 108 | return TE_Data 109 | 110 | 111 | def diagnostic_cleaner_helper(TE_Data): 112 | print() 113 | print(TE_Data.Order.unique()) 114 | print(TE_Data.SuperFamily.unique()) 115 | print() 116 | 117 | # To see unique for a given type: 118 | # print(TE_Data.loc[TE_Data['Order'] == 'LINE'].SuperFamily.unique()) 119 | return None 120 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/replace_names_blueberry.py: -------------------------------------------------------------------------------- 1 | def te_annot_renamer(TE_Data): 2 | U = "Unknown_Order" 3 | master_order = { 4 | "Unknown": U, 5 | "MITE": "TIR", 6 | "pararetrovirus": "pararetrovirus", 7 | "DNA": "TIR", 8 | } 9 | 10 | U = "Unknown_Superfam" 11 | master_superfamily = { 12 | # EDTA/Wicker et al 2007 renames to common name: 13 | "RLC": "Copia", 14 | "RLG": "Gypsy", 15 | "RLB": "Bel_Pao", 16 | "RLR": "Retrovirus", 17 | "RLE": "ERV", 18 | "RYD": "DIRS", 19 | "RYN": "Ngaro", 20 | "RYV": "VIPER", 21 | "RPP": "Penelope", 22 | "RIR": "R2", 23 | "RIT": "RTE", 24 | "RIJ": "Jockey", 25 | "RIL": "L1", 26 | "RII": 
"I", 27 | "RST": "tRNA", 28 | "RSL": "7SL", 29 | "RSS": "5S", 30 | "DTT": "Tc1-Mariner", 31 | "DTA": "hAT", 32 | "DTM": "Mutator", 33 | "DTE": "Merlin", 34 | "DTR": "Transib", 35 | "DTP": "P", 36 | "DTB": "PiggyBac", 37 | "DTH": "PIF-Harbinger", 38 | "DTC": "CACTA", 39 | "DYC": "Crypton", 40 | "DHH": "Helitron", 41 | "DMM": "Maverick", 42 | # Custom changes 43 | "unknown": U, 44 | "Unknown": U, 45 | "None": U, 46 | "EnSpm_CACTA": "CACTA", 47 | "MuDR_Mutator": "Mutator", 48 | "PIF_Harbinger": "PIF-Harbinger", 49 | } 50 | 51 | TE_Data.SuperFamily.fillna( 52 | value="Unknown_Superfam", inplace=True 53 | ) # replace None w U 54 | 55 | # Invoke dictionary to fix names 56 | TE_Data.Order.replace(master_order, inplace=True) 57 | TE_Data.SuperFamily.replace(master_superfamily, inplace=True) 58 | 59 | # Rename the superfamily value for pararetros as pararetrovirus 60 | TE_Data.loc[TE_Data.Order == "pararetrovirus", "SuperFamily"] = "pararetrovirus" 61 | 62 | # Rename unknown LINE element superfamilies to Unknown_LINE_Superfam to 63 | # distinguish between other unknowns 64 | TE_Data.loc[ 65 | (TE_Data.Order == "LINE") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 66 | "SuperFamily", 67 | ] = "Unknown_LINE_Superfam" 68 | 69 | # Rename unknown LTR element superfamilies to Unknown_LTR_Superfam to 70 | # distinguish between other unknowns 71 | TE_Data.loc[ 72 | (TE_Data["Order"] == "LTR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 73 | "SuperFamily", 74 | ] = "Unknown_LTR_Superfam" 75 | 76 | # Rename unknown TIR element superfamilies to Unknown_TIR_Superfam to 77 | # distinguish between other unknowns 78 | TE_Data.loc[ 79 | (TE_Data.Order == "TIR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 80 | "SuperFamily", 81 | ] = "Unknown_TIR_Superfam" 82 | 83 | # Rename both values for Helitron elements, so that 'Helitron' is 84 | # both the Order and SuperFamily value 85 | # Some Helitron elements were labeled 'DNA' in the Order location, this is 86 | # technically 
correct but I prefer to differentiate the TIR DNA elements 87 | # from DNA elements as a whole 88 | TE_Data.loc[ 89 | (TE_Data["Order"] == "TIR") & (TE_Data["SuperFamily"] == "Helitron"), 90 | ["Order", "SuperFamily"], 91 | ] = "Helitron" 92 | # If the Order is Helitron and the SuperFamily is unknown make the 93 | # superfamily 'Helitron' 94 | TE_Data.loc[ 95 | (TE_Data["Order"] == "Helitron") 96 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 97 | "SuperFamily", 98 | ] = "Helitron" 99 | 100 | # For TEs that are unknown for both Order AND SuperFamily we will call 101 | # those 'Completely_Unknown' 102 | TE_Data.loc[ 103 | (TE_Data["Order"] == "Unknown_Order") 104 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 105 | ["Order", "SuperFamily"], 106 | ] = "Completely_Unknown" 107 | 108 | return TE_Data 109 | 110 | 111 | def diagnostic_cleaner_helper(TE_Data): 112 | print() 113 | print(TE_Data.Order.unique()) 114 | print(TE_Data.SuperFamily.unique()) 115 | print() 116 | 117 | # To see unique for a given type: 118 | # print(TE_Data.loc[TE_Data['Order'] == 'LINE'].SuperFamily.unique()) 119 | return None 120 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/import_syntelogs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __author__ = "Scott Teresi" 4 | 5 | import pandas as pd 6 | 7 | 8 | def import_syntelogs(syntelog_input_file): 9 | """ 10 | Import the syntelogs from the raw file and manage data filtration 11 | """ 12 | 13 | col_names = [ 14 | "OrgA_Chromosome", 15 | "OrgA_Gene_Region", 16 | "OrgA_Start", 17 | "OrgA_Stop", 18 | "OrgB_Chromosome", 19 | "OrgB_Gene_Region", 20 | "OrgB_Start", 21 | "OrgB_Stop", 22 | "E_Value", 23 | "Diagonal_Score", 24 | "Web_Link", 25 | ] 26 | 27 | col_to_use = [ 28 | "OrgA_Chromosome", 29 | "OrgA_Gene_Region", 30 | "OrgB_Chromosome", 31 | "OrgB_Gene_Region", 32 | "E_Value", 33 | "Diagonal_Score", 34 | ] 
#!/usr/bin/env python3

__author__ = "Scott Teresi"

import pandas as pd


def import_syntelogs(syntelog_input_file):
    """
    Import the syntelogs from the raw file and manage data filtration

    Args:
        syntelog_input_file (str or file-like): SynMap output, tab-separated,
            with '#'-prefixed comment lines.

    Returns:
        pandas.DataFrame: columns OrgA_Chromosome, Glaberrima,
        OrgB_Chromosome, Sativa and E_Value, limited to same-chromosome
        pairs on chromosomes 1-12 with E-value below 0.05.
    """

    col_names = [
        "OrgA_Chromosome",
        "OrgA_Gene_Region",
        "OrgA_Start",
        "OrgA_Stop",
        "OrgB_Chromosome",
        "OrgB_Gene_Region",
        "OrgB_Start",
        "OrgB_Stop",
        "E_Value",
        "Diagonal_Score",
        "Web_Link",
    ]

    col_to_use = [
        "OrgA_Chromosome",
        "OrgA_Gene_Region",
        "OrgB_Chromosome",
        "OrgB_Gene_Region",
        "E_Value",
        "Diagonal_Score",
    ]

    syntelog_pandaframe = pd.read_csv(
        syntelog_input_file,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        comment="#",
        dtype={
            "OrgA_Chromosome": str,
            "OrgA_Gene_Region": str,
            "OrgB_Chromosome": str,
            "OrgB_Gene_Region": str,
            "E_Value": "float64",
            "Diagonal_Score": "int32",
        },
    )

    # Get the correct name for the genes
    # MAGIC to split the name correctly
    # BUGFIX: raw strings; the previous "\|\|" literal contains the invalid
    # escape '\|', which is a SyntaxWarning from Python 3.12 onwards. The
    # runtime value (the regex matching '||') is unchanged.
    syntelog_pandaframe["OrgA_Gene_Region"] = (
        syntelog_pandaframe["OrgA_Gene_Region"].str.split(r"\|\|").str[3]
    )
    syntelog_pandaframe["OrgB_Gene_Region"] = (
        syntelog_pandaframe["OrgB_Gene_Region"].str.split(r"\|\|").str[3]
    )

    # Remove rows that have transcript in the name because not worth dealing
    # with for example
    syntelog_pandaframe = syntelog_pandaframe[
        ~syntelog_pandaframe["OrgA_Gene_Region"].str.contains("transcript")
    ]
    syntelog_pandaframe = syntelog_pandaframe[
        ~syntelog_pandaframe["OrgB_Gene_Region"].str.contains("transcript")
    ]

    # Get the correct name for the gene names
    # MAGIC to split the name correctly
    syntelog_pandaframe["OrgA_Gene_Region"] = (
        syntelog_pandaframe["OrgA_Gene_Region"].str.split("CDS:").str[1]
    )
    syntelog_pandaframe["OrgB_Gene_Region"] = (
        syntelog_pandaframe["OrgB_Gene_Region"].str.split("CDS:").str[1]
    )

    syntelog_pandaframe["OrgA_Gene_Region"] = (
        syntelog_pandaframe["OrgA_Gene_Region"].str.split(".").str[0]
    )
    syntelog_pandaframe["OrgB_Gene_Region"] = (
        syntelog_pandaframe["OrgB_Gene_Region"].str.split("-").str[0]
    )

    # SynMap returns the transcript name for Sativa which can have slight
    # differences with the gene name, namely the letter g is replaced with
    # the letter t.
    # NB fix to get the gene name
    syntelog_pandaframe["OrgB_Gene_Region"] = syntelog_pandaframe[
        "OrgB_Gene_Region"
    ].str.replace("t", "g")

    # Get the correct name for the chromosome
    # MAGIC
    syntelog_pandaframe["OrgA_Chromosome"] = (
        syntelog_pandaframe["OrgA_Chromosome"].str.split("_").str[1]
    )
    syntelog_pandaframe["OrgB_Chromosome"] = (
        syntelog_pandaframe["OrgB_Chromosome"].str.split("_").str[1]
    )

    # This step is important, it could differ if your data input is different.
    syntelog_pandaframe.rename(
        columns={"OrgA_Gene_Region": "Glaberrima", "OrgB_Gene_Region": "Sativa"},
        inplace=True,
    )
    # Trim E-values less than 0.05
    # MAGIC
    syntelog_pandaframe = syntelog_pandaframe.loc[syntelog_pandaframe["E_Value"] < 0.05]

    syntelog_pandaframe.drop(
        columns=["Diagonal_Score"],
        inplace=True,
    )

    # I only want pairs where the chromosomes are equal
    syntelog_pandaframe = syntelog_pandaframe.loc[
        syntelog_pandaframe["OrgA_Chromosome"] == syntelog_pandaframe["OrgB_Chromosome"]
    ]

    # MAGIC rice has 12 chromosomes
    chromosome_list = [str(i) for i in range(1, 12 + 1)]
    syntelog_pandaframe = syntelog_pandaframe.loc[
        syntelog_pandaframe["OrgA_Chromosome"].isin(chromosome_list)
    ]

    return syntelog_pandaframe
#!/usr/bin/env python
# BUGFIX: shebang was "#!/usr/bin/env/python" (a nonexistent path inside
# env), which made the script unrunnable as an executable.

"""
Retrieve info of a list of genes
"""

__author__ = "Scott Teresi"

import argparse
import os
import logging
import coloredlogs

from transposon.gene_data import GeneData
from transposon.density_data import DensityData
from transposon.import_filtered_genes import import_filtered_genes


if __name__ == "__main__":
    path_main = os.path.abspath(__file__)
    dir_main = os.path.dirname(path_main)
    parser = argparse.ArgumentParser(
        description="""output to std_out information of each gene in a
        user-provided list of genes using the DensityData.info_of_gene()
        function"""
    )

    parser.add_argument(
        "te_density_hdf5_result",
        type=str,
        help="""Path to the HDF5 file that contains the genes that the user wants
        to extract TE density data from""",
    )

    parser.add_argument(
        "cleaned_gene_data_file",
        type=str,
        help="""Path to the cleaned gene data
        file that was produced prior to running the pipeline, this is necessary
        to initialize the DensityData obj""",
    )

    parser.add_argument(
        "chromosome_to_subset_gene_data",
        type=str,
        help="""The cleaned gene data
        file may contain information of genes from multiple chromosomes,
        because the TE Density data corresponds to one chromosome, please
        specify the appropriate chromosome identifier for the density data you
        are trying to access, so that we may appropriately subset the gene
        data""",
    )

    parser.add_argument(
        "genome_id",
        type=str,
        help="""Please specify the genome ID, use the same one you used as an
        argument when running TE Density""",
    )

    parser.add_argument(
        "list_of_genes",
        type=str,
        help="Path to list of genes files",
    )

    parser.add_argument(
        "--window_idx",
        type=int,
        default=1,
        help="""Index of the window that you want to access information from,
        windows go from lowest to highest, default TE density settings yield 20
        windows (500, 10000, 500) (start, stop, step). Stop is inclusive""",
    )

    parser.add_argument(
        "--n_te_types",
        type=int,
        default=5,
        help="""Number of TE types that you want to display when showing the
        top and bottom TE categories for density relative to your gene of
        interest""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="set debugging level to DEBUG"
    )

    args = parser.parse_args()
    args.te_density_hdf5_result = os.path.abspath(args.te_density_hdf5_result)
    args.cleaned_gene_data_file = os.path.abspath(args.cleaned_gene_data_file)
    args.list_of_genes = os.path.abspath(args.list_of_genes)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = logging.getLogger(__name__)
    coloredlogs.install(level=log_level)

    # Read pandas dataframe from cleaned genes file
    full_genome_gene_data = import_filtered_genes(args.cleaned_gene_data_file, logger)

    # MAGIC, chromosome category is inherent to the pandas data frame
    full_genome_gene_data_subsetted = full_genome_gene_data.loc[
        full_genome_gene_data["Chromosome"] == args.chromosome_to_subset_gene_data
    ]

    # Initialize GeneData obj
    specific_chromosome_gene_data = GeneData(
        full_genome_gene_data_subsetted, args.genome_id
    )

    # Initialize DensityData obj
    processed_density_data = DensityData.verify_h5_cache(
        args.te_density_hdf5_result, specific_chromosome_gene_data, logger
    )

    # Read list of genes file from user and begin printing output to console
    with open(args.list_of_genes, "r", encoding="utf-8") as in_file:
        all_genes = [gene.strip() for gene in in_file]
    for gene in all_genes:
        print(
            processed_density_data.info_of_gene(gene, args.window_idx, args.n_te_types)
        )
# TODO candidate for deletion


def te_annot_renamer(TE_Data):
    """Normalize Order/SuperFamily labels of a RepeatMasker/EDTA TE annotation.

    Replaces Wicker-style codes with common names, drops annotation
    bookkeeping and contig rows, and collapses fully-unknown elements to
    'Completely_Unknown'.

    Args:
        TE_Data (pandas.DataFrame): requires string columns 'Chromosome',
            'Order' and 'SuperFamily'.

    Returns:
        pandas.DataFrame: a filtered frame with renamed categories.
    """
    U = "Unknown_Order"
    master_order = {
        # Custom changes
        ## RepeatMasker-based Changes
        "unknown": U,
        "Unknown": U,
        "MITE": "DNA",
        "RC?": "DNA",
        "RC": "DNA",
        "SINE?": U,
        "tandem": "Tandem",
        "No_hits": U,
        ## EDTA-based Changes
        "pararetrovirus": "LTR",
        "mixture": "Mixture",
        "DNA": "TIR",
    }

    U = "Unknown_SuperFam"
    master_superfamily = {
        # EDTA/Wicker et al 2007 renames to common name:
        "RLC": "Copia",
        "RLG": "Gypsy",
        "RLB": "Bel_Pao",
        "RLR": "Retrovirus",
        "RLE": "ERV",
        "RYD": "DIRS",
        "RYN": "Ngaro",
        "RYV": "VIPER",
        "RPP": "Penelope",
        "RIR": "R2",
        "RIT": "RTE",
        "RIJ": "Jockey",
        "RIL": "L1",
        "RII": "I",
        "RST": "tRNA",
        "RSL": "7SL",
        "RSS": "5S",
        "DTT": "Tc1_Mariner",
        "DTA": "hAT",
        "DTM": "Mutator",
        "DTE": "Merlin",
        "DTR": "Transib",
        "DTP": "P",
        "DTB": "PiggyBac",
        "DTH": "PIF_Harbinger",
        "DTC": "CACTA",
        "DYC": "Crypton",
        "DHH": "Helitron",
        "DMM": "Maverick",
        # Custom changes ("Uknown" is a typo that occurs in the input data)
        "Uknown": U,
        "unknown": U,
        "Unknown": U,
        "EnSpm_CACTA": "CACTA",
        # NB a large block of commented-out legacy RepeatMasker mappings was
        # removed here; see version control history if they are needed again.
    }
    # BUGFIX: column assignment instead of chained `inplace=True`, which no
    # longer mutates the parent frame under pandas copy-on-write (the
    # default in pandas 3.0).
    TE_Data["SuperFamily"] = TE_Data["SuperFamily"].fillna("Unknown_SuperFam")
    # step to fix TE names
    TE_Data["Order"] = TE_Data["Order"].replace(master_order)
    TE_Data["SuperFamily"] = TE_Data["SuperFamily"].replace(master_superfamily)
    TE_Data.loc[TE_Data.Order == "Tandem", "SuperFamily"] = "Tandem"

    # Drop GFF bookkeeping rows; .copy() avoids SettingWithCopyWarning on
    # the .loc assignments below (behavior unchanged).
    to_drop = TE_Data.Chromosome.str.contains("##sequence-region")
    TE_Data = TE_Data[~to_drop].copy()
    # NOTE(review): "contig*" is a regex meaning 'conti' plus zero or more
    # 'g' characters, so this drops any chromosome containing 'conti'.
    # Probably meant the literal prefix 'contig' -- confirm before changing.
    to_drop = TE_Data.Chromosome.str.contains("contig*")
    TE_Data = TE_Data[~to_drop].copy()

    # Elements unknown at both levels become 'Completely_Unknown'.
    TE_Data.loc[
        (TE_Data["Order"] == "Unknown_Order")
        & (TE_Data["SuperFamily"] == "Unknown_SuperFam"),
        ["Order", "SuperFamily"],
    ] = "Completely_Unknown"

    # Make 'Helitron' both the Order and SuperFamily value.
    TE_Data.loc[
        (TE_Data["Order"] == "Helitron")
        & (TE_Data["SuperFamily"] == "Unknown_SuperFam"),
        ["SuperFamily"],
    ] = "Helitron"
    TE_Data.loc[(TE_Data["Order"] == "Helitron"), ["Order"]] = "Helitron"
    TE_Data.loc[(TE_Data["SuperFamily"] == "Helitron"), ["Order"]] = "Helitron"
    TE_Data.loc[(TE_Data["Order"] == "Mixture"), ["SuperFamily"]] = "Mixture"

    # LTR elements outside the recognized superfamilies are unknown LTRs.
    ltr_elements = ["Copia", "Gypsy"]
    TE_Data.loc[
        (TE_Data["Order"] == "LTR") & (~TE_Data["SuperFamily"].isin(ltr_elements)),
        ["SuperFamily"],
    ] = "Unknown_LTR_Superfam"
    # Drop Order categories that are not genuine TEs.
    TE_Data = TE_Data[TE_Data.Order != "Simple_repeat"]  # drop s repeat
    TE_Data = TE_Data[TE_Data.Order != "long_terminal_repeat"]  # drop
    TE_Data = TE_Data[TE_Data.Order != "Maverick"]  # drop if in Order category
    TE_Data = TE_Data[
        TE_Data.Order != "target_site_duplication"
    ]  # drop if in Order category
    return TE_Data
def check_nulls(my_df, logger):
    """Check the TE dataframe for ANY null values in ANY rows.

    Args:
        my_df (pandas.core.DataFrame): Pandas dataframe of TE values from TE
            annotation

        logger (logging.Logger): object to log information to
    """
    has_nulls = my_df.isnull().values.any()
    if has_nulls:
        logger.critical("You have null values in your dataframe!")
        logger.critical("Here are the null values in the output:")
        null_columns = my_df.columns[my_df.isnull().any()]
        # FIX: report through the logger instead of a bare print() so the
        # diagnostic lands in the same stream as the other messages
        # (consistent with import_rice_EDTA.py)
        logger.info(my_df[my_df.isnull().any(axis=1)][null_columns].head())


def write_cleaned_transposons(te_pandaframe, output_dir, genome_name, logger):
    """Write the cleaned TE annotation to disk as a headered TSV.

    Args:
        te_pandaframe (pandas.core.DataFrame): cleaned TE annotation
        output_dir (str): parent directory for the output file
        genome_name (str): genome identifier used in the output filename
        logger (logging.Logger): object to log information to
    """
    file_name = os.path.join(output_dir, ("Cleaned_" + genome_name + "_EDTA_TEs.tsv"))

    logger.info("Writing cleaned TE file to: %s" % file_name)
    te_pandaframe.to_csv(file_name, sep="\t", header=True, index=False)


def import_transposons(tes_input_path, te_annot_renamer, logger):
    """Import a blueberry EDTA TE annotation (GFF) as a cleaned dataframe.

    Args:
        tes_input_path (str): string of the file path to the TE annotation

        te_annot_renamer (callable): function mapping a raw TE dataframe to
            one with normalized Order/SuperFamily labels

        logger (logging obj): The object to call logs and info

    Returns:
        pandas.core.DataFrame: TE annotation with columns Chromosome, Start,
        Stop, Strand, Order, SuperFamily, Length; restricted to the first
        48 blueberry scaffolds and sorted by Chromosome then Start.
    """
    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Phase",
        "Attribute",
    ]

    TE_Data = pd.read_csv(
        tes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        comment="#",
        dtype={"Start": "float64", "Stop": "float64"},
    )

    # Drop extraneous columns
    TE_Data.drop(columns=["Score", "Software", "Phase", "Feature"], inplace=True)

    # Create Order and SuperFamily column from Attribute column
    # Because that column contains the detailed TE information
    # Then remove old Attribute column
    TE_Data["Attribute"] = TE_Data["Attribute"].str.extract(r"Classification=(.*?);")
    TE_Data[["Order", "SuperFamily"]] = TE_Data.Attribute.str.split("/", expand=True)
    TE_Data.drop(columns=["Attribute"], inplace=True)
    TE_Data.Order = TE_Data.Order.astype(str)
    TE_Data.SuperFamily = TE_Data.SuperFamily.astype(str)
    TE_Data.Strand = TE_Data.Strand.astype(str)

    # Rename because Blueberry scaffolds got renamed during EDTA
    TE_Data["Chromosome"] = "VaccDscaff" + TE_Data["Chromosome"].astype(str)

    # Call renamer
    TE_Data = te_annot_renamer(TE_Data)

    # Declare data types; Length is inclusive of both endpoints
    TE_Data["Length"] = TE_Data.Stop - TE_Data.Start + 1
    check_nulls(TE_Data, logger)

    TE_Data.sort_values(by=["Chromosome", "Start"], inplace=True)

    # MAGIC I only want the first 48 chromosomes
    chromosomes_i_want = ["VaccDscaff" + str(i) for i in range(49)]
    TE_Data = TE_Data.loc[TE_Data["Chromosome"].isin(chromosomes_i_want)]

    return TE_Data
def check_nulls(my_df, logger):
    """Check the TE dataframe for ANY null values in ANY rows.

    Args:
        my_df (pandas.core.DataFrame): Pandas dataframe of TE values from TE
            annotation

        logger (logging.Logger): object to log information to
    """
    found_nulls = my_df.isnull().values.any()
    if found_nulls:
        logger.critical("You have null values in your dataframe!")
        logger.critical("Here are the null values in the output:")
        null_columns = my_df.columns[my_df.isnull().any()]
        # FIX: report through the logger instead of a bare print() so the
        # diagnostic lands in the same stream as the other messages
        # (consistent with import_rice_EDTA.py)
        logger.info(my_df[my_df.isnull().any(axis=1)][null_columns].head())


def write_cleaned_transposons(te_pandaframe, output_dir, old_filename, logger):
    """Write the cleaned TE annotation to disk as a headered TSV.

    Args:
        te_pandaframe (pandas.core.DataFrame): cleaned TE annotation
        output_dir (str): parent directory for the output file
        old_filename (str): original annotation path; its basename is reused
            with a "Cleaned_" prefix and a .tsv extension
        logger (logging.Logger): object to log information to
    """
    file_name = os.path.join(
        output_dir,
        ("Cleaned_" + os.path.splitext(os.path.basename(old_filename))[0]) + ".tsv",
    )  # MAGIC to get proper extension

    logger.info("Writing cleaned TE file to: %s" % file_name)
    te_pandaframe.to_csv(file_name, sep="\t", header=True, index=False)


def import_transposons(tes_input_path, te_annot_renamer, logger):
    """Import an Arabidopsis EDTA TE annotation (GFF) as a cleaned dataframe.

    Args:
        tes_input_path (str): string of the file path to the TE annotation

        te_annot_renamer (callable): function mapping a raw TE dataframe to
            one with normalized Order/SuperFamily labels

        logger (logging obj): The object to call logs and info

    Returns:
        pandas.core.DataFrame: TE annotation with columns Chromosome, Start,
        Stop, Strand, Order, SuperFamily, Length, sorted by Chromosome then
        Start.
    """
    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Phase",
        "Attribute",
    ]

    te_data = pd.read_csv(
        tes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        comment="#",
        dtype={"Start": "float64", "Stop": "float64"},
    )

    # Drop extraneous columns
    te_data.drop(columns=["Score", "Software", "Phase", "Feature"], inplace=True)

    # Create Order and SuperFamily column from Attribute column
    # Because that column contains the detailed TE information
    # Then remove old Attribute column
    te_data["Attribute"] = te_data["Attribute"].str.extract(r"Classification=(.*?);")
    te_data[["Order", "SuperFamily"]] = te_data.Attribute.str.split("/", expand=True)
    te_data.drop(columns=["Attribute"], inplace=True)
    te_data.Order = te_data.Order.astype(str)
    te_data.SuperFamily = te_data.SuperFamily.astype(str)
    te_data.Strand = te_data.Strand.astype(str)

    # Call renamer
    te_data = te_annot_renamer(te_data)

    # Declare data types; Length is inclusive of both endpoints
    te_data["Length"] = te_data.Stop - te_data.Start + 1
    check_nulls(te_data, logger)

    te_data.sort_values(by=["Chromosome", "Start"], inplace=True)

    return te_data
"-o", 112 | type=str, 113 | default=output_default, 114 | help="Parent directory to output results", 115 | ) 116 | 117 | parser.add_argument( 118 | "-v", "--verbose", action="store_true", help="set debugging level to DEBUG" 119 | ) 120 | 121 | args = parser.parse_args() 122 | args.TE_input_file = os.path.abspath(args.TE_input_file) 123 | args.output_dir = os.path.abspath(args.output_dir) 124 | 125 | log_level = logging.DEBUG if args.verbose else logging.INFO 126 | logger = logging.getLogger(__name__) 127 | coloredlogs.install(level=log_level) 128 | 129 | # Execute 130 | cleaned_transposons = import_transposons( 131 | args.TE_input_file, te_annot_renamer, logger 132 | ) 133 | write_cleaned_transposons( 134 | cleaned_transposons, args.output_dir, args.TE_input_file, logger 135 | ) 136 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/Makefile: -------------------------------------------------------------------------------- 1 | # scripts for running rice synteny TE Density examples 2 | # __file__ Makefile 3 | # __author__ Scott Teresi 4 | 5 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 6 | DEV_GFF_READ_EXECUTABLE := /home/scott/Documents/Uni/Research/gffread 7 | #DEV_GFF_READ_EXECUTABLE := /mnt/research/edgerpat_lab/Scotty/gffread # use if on HPCC 8 | DEV_DATA := $(realpath $(ROOT_DIR)/../../../TE_Density_Example_Data/Rice) 9 | DEV_GLABERRIMA_DATA := $(DEV_DATA)/Oryza_Glaberrima 10 | DEV_GLABERRIMA_FASTA_DATA := $(DEV_GLABERRIMA_DATA)/Sequences 11 | DEV_GLABERRIMA_GENES := $(DEV_GLABERRIMA_DATA)/Genes/Oryza_glaberrima.Oryza_glaberrima_V1.50.gff3 12 | DEV_GLABERRIMA_TEs := $(DEV_GLABERRIMA_DATA)/TEs/Oryza_Glaberrima_NewNames.fasta.mod.EDTA.TEanno.gff3 13 | DEV_SATIVA_DATA := $(DEV_DATA)/Oryza_Sativa 14 | DEV_SATIVA_FASTA_DATA := $(DEV_SATIVA_DATA)/Sequences 15 | DEV_SATIVA_GENES := $(DEV_SATIVA_DATA)/Genes/Oryza_sativa.IRGSP-1.0.50.gff3 16 | DEV_SATIVA_TEs := 
$(DEV_SATIVA_DATA)/TEs/Oryza_Sativa_NewNames.fasta.mod.EDTA.TEanno.gff3 17 | DEV_FILTERED := $(realpath $(ROOT_DIR)/../../../TE_Data/filtered_input_data) 18 | DEV_HDF5 := $(realpath $(ROOT_DIR)/../../../TE_Data/finalized_data/10KB) 19 | DEV_RESULTS:= $(realpath $(ROOT_DIR)/results) 20 | 21 | .PHONY: create_CDS fix_fasta_names fix_CDS_names 22 | 23 | create_CDS: 24 | @echo 25 | @echo Creating CDS from GFF and fasta file for glaberrima 26 | $(DEV_GFF_READ_EXECUTABLE)/gffread -x $(DEV_GLABERRIMA_FASTA_DATA)/Oryza_glaberrima_CDS.fasta -g $(DEV_GLABERRIMA_FASTA_DATA)/Oryza_glaberrima.Oryza_glaberrima_V1.dna.toplevel.fa $(DEV_GLABERRIMA_GENES) 27 | @echo Creating CDS from GFF and fasta file for sativa 28 | $(DEV_GFF_READ_EXECUTABLE)/gffread -x $(DEV_SATIVA_FASTA_DATA)/Oryza_sativa_CDS.fasta -g $(DEV_SATIVA_FASTA_DATA)/Oryza_sativa.IRGSP-1.0.dna.toplevel.fa $(DEV_SATIVA_GENES) 29 | @echo 30 | 31 | 32 | fix_fasta_names: 33 | @echo 34 | @echo Fixing the fasta names for glaberrima so that they are not too long for EDTA 35 | python $(ROOT_DIR)/src/fix_fasta_names.py $(DEV_GLABERRIMA_FASTA_DATA)/Oryza_glaberrima.Oryza_glaberrima_V1.dna.toplevel.fa Oryza_Glaberrima 36 | @echo Fixing the fasta names for sativa so that they are not too long for EDTA 37 | python $(ROOT_DIR)/src/fix_fasta_names.py $(DEV_SATIVA_FASTA_DATA)/Oryza_sativa.IRGSP-1.0.dna.toplevel.fa Oryza_Sativa 38 | @echo 39 | 40 | 41 | 42 | fix_CDS_names: 43 | @echo 44 | @echo Fixing the CDS fasta names for glaberrima so that they are not too long for EDTA 45 | python $(ROOT_DIR)/src/fix_cds_names.py $(DEV_GLABERRIMA_FASTA_DATA)/Oryza_glaberrima_CDS.fasta Oryza_Glaberrima 46 | @echo Fixing the CDS fasta names for sativa so that they are not too long for EDTA 47 | python $(ROOT_DIR)/src/fix_cds_names.py $(DEV_SATIVA_FASTA_DATA)/Oryza_sativa_CDS.fasta Oryza_Sativa 48 | @echo 49 | 50 | run_EDTA_HPCC: 51 | @echo Running EDTA for glaberrima 52 | sbatch $(ROOT_DIR)/src/Annotate_EDTA_Rice_Glaberrima.sb 53 | @echo Running EDTA 
filter_genes:
	@echo Filtering glaberrima genes into appropriate format for TE Density
	python $(ROOT_DIR)/src/import_rice_gene_anno.py $(DEV_GLABERRIMA_GENES)
	@echo Filtering sativa genes into appropriate format for TE Density
	python $(ROOT_DIR)/src/import_rice_gene_anno.py $(DEV_SATIVA_GENES)

# FIX: the echo previously said "blueberry"; this Makefile drives the rice data
filter_TEs:
	@echo Filtering rice TEs into appropriate format for TE Density
	python $(ROOT_DIR)/src/import_rice_EDTA.py $(DEV_GLABERRIMA_TEs)
	python $(ROOT_DIR)/src/import_rice_EDTA.py $(DEV_SATIVA_TEs)

run_TE_Density_HPCC:
	@echo Running TE Density for Glaberrima
	sbatch $(ROOT_DIR)/src/TE_Density_Glaberrima.sb
	sbatch $(ROOT_DIR)/src/TE_Density_Sativa.sb

filter_syntelogs:
	@echo Filtering syntelog file from SynMap to a cleaner version for downstream analysis
	mkdir -p $(ROOT_DIR)/results
	python $(ROOT_DIR)/src/generate_pairs.py $(DEV_DATA)/SynMap_Results/Glaberrima_VS_Sativa_SynMap.tsv -o $(ROOT_DIR)/results


compare_syntelog_TE_differences:
	@echo Generate graphs of syntelog TE differences
	@echo This is for chromosome 1
	mkdir -p $(ROOT_DIR)/results/graphs
	python $(ROOT_DIR)/src/compare_density.py $(DEV_RESULTS)/set_syntelogs.tsv $(DEV_HDF5)/Sativa/Sativa_1.h5 $(DEV_HDF5)/Glaberrima/Glaberrima_1.h5 $(DEV_FILTERED)/Cleaned_Oryza_sativa.IRGSP-1.0.50.tsv $(DEV_FILTERED)/Cleaned_Oryza_glaberrima.Oryza_glaberrima_V1.50.tsv -o $(ROOT_DIR)/results

# TODO Rename inputs for Rice
# NB target name keeps the historical typo ("interesing") so existing
# invocations and docs that reference it keep working
identify_interesing_genes:
	@echo Reporting genes with interesting levels of TE density
	mkdir -p $(DEV_RESULTS)/tables
	python $(ROOT_DIR)/src/find_abnormal_genes.py $(DEV_FILTERED)/Cleaned_Oryza_sativa.IRGSP-1.0.50.tsv $(DEV_HDF5)/Sativa -o $(DEV_RESULTS)/tables
def reformat_seq_iq(input_fasta, genome_name, output_dir, logger):
    """
    Reformat a regular FASTA file to have shorter sequence ID names for EDTA

    Args:
        input_fasta (str): String path to input fasta file

        genome_name (str): String for genome name

        output_dir (str): Path to output dir

        logger (logging.Logger): Object to log information to

    Returns:
        None, just saves the edited FASTA file to disk. Also writes a
        conversion table to disk for the old names and their new name
        counterparts
    """
    # MAGIC file suffixes
    new_fasta = os.path.join(output_dir, (genome_name + "_NewNames.fasta"))
    name_key = os.path.join(output_dir, (genome_name + "_Seq_ID_Conversion.txt"))

    if os.path.exists(new_fasta):
        os.remove(new_fasta)  # start from a clean slate
    if os.path.exists(name_key):
        os.remove(name_key)
    pair_dict = {}  # NB this is used to write the conversion key later for
    # clarity and note-taking

    # MAGIC we only want specific chromosomes
    chromosomes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, "Mt", "Pt"]
    chromosomes = [str(x) for x in chromosomes]  # redefine as string

    count = 0
    # FIX: open the destination ONCE instead of reopening it in append mode
    # for every single record (one open/close syscall pair per sequence);
    # the two original branches also duplicated identical write logic.
    with open(input_fasta, "r") as in_handle, open(new_fasta, "w") as out_handle:
        for s_record in SeqIO.parse(in_handle, "fasta"):
            if s_record.id not in chromosomes:
                # Non-chromosome scaffolds get a short auxiliary name so
                # their IDs are not too long for EDTA
                s_record.id = "Auxiliary_" + str(count)
                count += 1
            # NB key is the (possibly renamed) id; the value preserves the
            # full original header text for the conversion table
            pair_dict[s_record.id] = "\t " + s_record.description
            s_record.description = ""  # NB clear so the rewritten header
            # does not carry the extraneous info
            SeqIO.write(s_record, out_handle, "fasta")

    logger.info("Finished writing new fasta to: %s" % new_fasta)

    with open(name_key, "w") as output:
        for key, val in pair_dict.items():
            # Write the conversion table for record-keeping.
            output.write("%s\t%s\n" % (key, val))
    # NB name_key already contains output_dir; the original double
    # os.path.join was a no-op for absolute paths
    logger.info("Finished writing name conversion table to: %s" % name_key)
def write_cleaned_genes(gene_pandaframe, output_dir, old_filename, logger):
    """Write the cleaned gene annotation to disk as a headered TSV.

    Args:
        gene_pandaframe (pandas.core.DataFrame): cleaned gene annotation,
            indexed by gene name (index is written out)
        output_dir (str): parent directory for the output file
        old_filename (str): original annotation path; its basename is reused
            with a "Cleaned_" prefix and a .tsv extension
        logger (logging.Logger): object to log information to
    """
    file_name = os.path.join(
        output_dir,
        ("Cleaned_" + os.path.splitext(os.path.basename(old_filename))[0]) + ".tsv",
    )  # MAGIC to get proper extension

    logger.info("Writing cleaned gene file to: %s" % file_name)
    gene_pandaframe.to_csv(file_name, sep="\t", header=True, index=True)


def import_genes(genes_input_path, logger):
    """Import a gene annotation (GFF) and reduce it to gene rows.

    Args:
        genes_input_path (str): path to the gene annotation file

        logger (logging.Logger): object to log information to

    Returns:
        pandas.core.DataFrame: one row per gene, indexed by Gene_Name, with
        columns Chromosome, Feature, Start, Stop, Strand, Length; sorted by
        Chromosome then Start, null rows dropped.
    """

    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Frame",
        "FullName",
    ]

    col_to_use = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Strand",
        "FullName",
    ]

    gene_data = pd.read_csv(
        genes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        dtype={
            "Stop": "float64",
            "Start": "float64",
            "Chromosome": str,
            "Strand": str,
            # BUG FIX: key was misspelled "Fullname", so the str dtype was
            # silently never applied to the FullName column
            "FullName": str,
            "Feature": str,
            "Software": str,
        },
        comment="#",
    )

    # rows in annotation
    gene_data = gene_data[gene_data.Feature == "gene"]  # drop non-gene rows

    # Pull the gene name out of the GFF attribute string, then drop the
    # columns we no longer need
    gene_data["Gene_Name"] = gene_data["FullName"].str.extract(r"ID=(.*?);")
    gene_data = gene_data.drop(columns=["FullName", "Software"])
    gene_data["Length"] = gene_data.Stop - gene_data.Start + 1

    gene_data.sort_values(by=["Chromosome", "Start"], inplace=True)
    check_nulls(gene_data, logger)
    gene_data = drop_nulls(gene_data, logger)

    # Set the gene name as the index
    gene_data.set_index("Gene_Name", inplace=True)

    return gene_data


def get_nulls(my_df, logger):
    """
    Print out the row IDs where the null values exist

    Args:
        my_df (Pandaframes): Pandaframe to check null values in

        logger (logging.Logger): object to log information to
    """
    nas = my_df[my_df.isna().any(axis=1)]
    logger.warning("Rows where null exist: %s" % nas)


def drop_nulls(my_df, logger):
    """
    Drop null values inside a Pandaframe

    Args:
        my_df (Pandaframes): Pandaframe to drop null values

        logger (logging.Logger): object to log information to

    Returns:
        Pandaframes: the input with any row containing a null removed
    """
    nas = my_df[my_df.isna().any(axis=1)]
    if not nas.empty:
        logger.warning("Dropping rows with at least one Null value!")
    my_df = my_df.dropna(axis=0, how="any")
    return my_df
# System RAM in GiB, computed once at import time from POSIX sysconf
MAX_SYSTEM_RAM_GB = sysconf("SC_PAGE_SIZE") * sysconf("SC_PHYS_PAGES") / (1024.0 ** 3)
# Factory for a FileNotFoundError pre-filled with ENOENT; call with a path
FILE_DNE = partial(FileNotFoundError, errno.ENOENT, strerror(errno.ENOENT))


def set_numexpr_threads(n_threads=None):
    """Set number of threads for use in Numpy/Pandas NumExpr.

    NumExpr uses a default of the numexpr.detect_number_of_cores().
    This appears to be the number of hyperthreads.
    Calling this will prevent numexpr from making a log call at startup.

    Args:
        n_threads (int | None): thread count; falsy values select the
            NumExpr-detected core count
    """

    n_threads = n_threads or numexpr.detect_number_of_cores()
    numexpr.set_num_threads(n_threads)


def raise_if_no_file(filepath, logger=None, msg_fmt=None):
    """Raise FileNotFoundError if file does not exist.

    Args:
        filepath (str): path to check
        logger (logging.Logger | None): destination for the critical message
        msg_fmt (str | None): %-style format with one %s for the path
    """

    logger = logger or logging.getLogger(__name__)
    msg_fmt = msg_fmt or "not a file: %s"
    if not os.path.isfile(filepath):
        logger.critical(msg_fmt % filepath)
        raise FILE_DNE(filepath)


def raise_if_no_dir(filepath, logger=None, msg_fmt=None):
    """Raise FileNotFoundError if the path is not an existing directory.

    Args:
        filepath (str): path to check
        logger (logging.Logger | None): destination for the critical message
        msg_fmt (str | None): %-style format with one %s for the path
    """

    logger = logger or logging.getLogger(__name__)
    # BUG FIX: the caller-supplied msg_fmt was unconditionally overwritten;
    # honor it like raise_if_no_file does
    msg_fmt = msg_fmt or "not a directory: %s"
    if not os.path.isdir(filepath):
        logger.critical(msg_fmt % filepath)
        raise FILE_DNE(filepath)


def check_ram(ram_bytes, logger):
    """Raise ValueError if the requested RAM is negative or greater than the system.

    Args:
        ram_bytes (int): requested cache size in bytes
        logger (logging.Logger): destination for the critical message
    """

    if ram_bytes < 0:
        logger.critical("cache %i bytes < 0" % ram_bytes)
        raise ValueError()
    ram_gb = ram_bytes / (1024.0 ** 3)
    if ram_gb > MAX_SYSTEM_RAM_GB:
        # BUG FIX: the format string has two placeholders so it needs a
        # 2-tuple; the old code interpolated a single float, which raised
        # TypeError instead of the intended ValueError
        msg = "cache %i GB > system %i GB" % (ram_gb, MAX_SYSTEM_RAM_GB)
        logger.critical(msg)
        raise ValueError(msg)


def write_vlen_str_h5py(h5file, strings, dataset_key):
    """Write to an H5 File an iterable of variable length unicode.

    Args:
        h5file(h5py.File): opened destination file
        strings(iterable(str)): string list
        dataset_key(str): name of data set to write
    """

    vlen = h5py.special_dtype(vlen=str)
    # BUG FIX: materialize once; the old code counted with a generator pass
    # and then iterated again, so a generator input was exhausted before
    # the write and produced an empty dataset
    as_str = [str(s) for s in strings]
    dset = h5file.create_dataset(dataset_key, (len(as_str),), dtype=vlen)
    dset[:] = as_str


def read_vlen_str_h5py(h5file, dataset_key):
    """Read from an H5 File an iterable of variable length unicode.

    Args:
        h5file(h5py.File): opened source file
        dataset_key(str): name of data set to read

    Returns:
        list(str): dataset contents
    """

    return h5file[dataset_key][:].tolist()


def check_strand(my_df, logger):
    """
    In the gene annotation (GFF format), check to see if the user provided
    incoherent values and raise an error.
    Values should only be '-' or '+' or '.'. Will raise a logger info if '.'
    because we will assume that the gene is '+', because we HAVE to choose a
    direction for the gene to be orientated.

    Args:
        my_df (pandas.DataFrame): must contain a 'Strand' column
        logger (logging.Logger): destination for messages

    Raises:
        ValueError: if any Strand value is outside the whitelist
    """
    acceptable_values = ["-", "+", "."]
    # Check if you have any values in Strand that are not part of the whitelist.
    # FIX: use `not` rather than bitwise `~`; `~` on a plain Python bool is
    # integer negation (~True == -2, truthy) and would invert the test
    if not my_df["Strand"].isin(acceptable_values).all():
        unacceptable_values = my_df["Strand"].unique()
        unacceptable_values = [
            item for item in unacceptable_values if item not in acceptable_values
        ]
        logger.critical(
            """
            You have values in your gene annotation that do not
            conform to the GFF file format. Please fix this. Your unacceptable
            strand values are: %s
            """
            % unacceptable_values
        )
        raise ValueError

    # NOTE separate check for '.'
    if my_df["Strand"].isin(["."]).any():
        logger.warning(
            """
            You have rows in your gene annotation that have '.' as
            a Strand column value. Please note that we will treat
            these genes as sense orientation for the purposes of
            calculating TE Density.

            %s
            """
            % my_df.loc[my_df["Strand"] == "."]
        )
def check_nulls(my_df, logger):
    """Check the TE dataframe for ANY null values in ANY rows.

    Args:
        my_df (pandas.core.DataFrame): Pandas dataframe of TE values from TE
            annotation

        logger (logging.Logger): object to log information to
    """
    found_nulls = my_df.isnull().values.any()
    if found_nulls:
        logger.critical("You have null values in your dataframe!")
        logger.critical("Here are the null values in the output:")
        null_columns = my_df.columns[my_df.isnull().any()]
        logger.info(my_df[my_df.isnull().any(axis=1)][null_columns].head())


def write_cleaned_transposons(te_pandaframe, output_dir, old_filename, logger):
    """Write the cleaned TE annotation to disk as a headered TSV.

    Args:
        te_pandaframe (pandas.core.DataFrame): cleaned TE annotation
        output_dir (str): parent directory for the output file
        old_filename (str): original annotation path; its basename is reused
            with a "Cleaned_" prefix and a .tsv extension
        logger (logging.Logger): object to log information to
    """
    file_name = os.path.join(
        output_dir,
        ("Cleaned_" + os.path.splitext(os.path.basename(old_filename))[0]) + ".tsv",
    )  # MAGIC to get proper extension

    logger.info("Writing cleaned TE file to: %s" % file_name)
    te_pandaframe.to_csv(file_name, sep="\t", header=True, index=False)


def import_transposons(tes_input_path, te_annot_renamer, logger):
    """Import a rice EDTA TE annotation (GFF) as a cleaned dataframe.

    Args:
        tes_input_path (str): string of the file path to the TE annotation

        te_annot_renamer (callable): function mapping a raw TE dataframe to
            one with normalized Order/SuperFamily labels

        logger (logging obj): The object to call logs and info

    Returns:
        pandas.core.DataFrame: TE annotation with columns Chromosome, Start,
        Stop, Strand, Order, SuperFamily, Length; restricted to chromosomes
        "1" through "12" and sorted by Chromosome then Start.
    """
    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Phase",
        "Attribute",
    ]

    te_pandaframe = pd.read_csv(
        tes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        comment="#",
        dtype={"Start": "float64", "Stop": "float64"},
    )

    # Drop extraneous columns
    te_pandaframe.drop(columns=["Score", "Software", "Phase", "Feature"], inplace=True)

    # Create Order and SuperFamily column from Attribute column
    # Because that column contains the detailed TE information
    # Then remove old Attribute column
    te_pandaframe["Attribute"] = te_pandaframe["Attribute"].str.extract(
        r"Classification=(.*?);"
    )
    te_pandaframe[["Order", "SuperFamily"]] = te_pandaframe.Attribute.str.split(
        "/", expand=True
    )
    te_pandaframe.drop(columns=["Attribute"], inplace=True)
    te_pandaframe.Order = te_pandaframe.Order.astype(str)
    te_pandaframe.SuperFamily = te_pandaframe.SuperFamily.astype(str)
    te_pandaframe.Strand = te_pandaframe.Strand.astype(str)

    # Call renamer
    te_pandaframe = te_annot_renamer(te_pandaframe)

    # Declare data types; Length is inclusive of both endpoints
    te_pandaframe["Length"] = te_pandaframe.Stop - te_pandaframe.Start + 1
    check_nulls(te_pandaframe, logger)

    # MAGIC I only want the first 12 chromosomes
    chromosomes_i_want = [str(i) for i in range(1, 12 + 1)]  # MAGIC plus 1 bc range
    # NB, chromosomes_i_want must be string
    te_pandaframe = te_pandaframe.loc[
        te_pandaframe["Chromosome"].isin(chromosomes_i_want)
    ]
    # FIX: sort once, after filtering; the old code also sorted before the
    # filter, which was redundant work since row filtering preserves order
    te_pandaframe.sort_values(by=["Chromosome", "Start"], inplace=True)

    return te_pandaframe
RepeatMasker LTR/unknown 15404 16346 19.1 - 3262 Fvb1-1:9463282..9465926_LTR 5 | Fvb1-1 RepeatMasker LTR/unknown 16347 20224 3.8 - 30004 Fvb3-2:23517681..23521581_INT-int 6 | Fvb1-1 RepeatMasker LTR/unknown 20225 22848 15.4 - 12678 Fvb1-1:9463282..9465926_LTR 7 | Fvb1-1 RepeatMasker LTR/Gypsy 23450 24094 20.3 - 2636 Fvb1-1:5519794..5522056_LTR 8 | Fvb1-1 RepeatMasker LTR/unknown 24133 24415 19.0 - 1082 Fvb1-3:25198782..25200749_LTR 9 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 24412 24910 8.5 + 2257 family-72 10 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 25024 25752 18.8 + 1671 family-72 11 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 25751 25840 21.4 + 351 family-72 12 | Fvb1-1 RepeatMasker LTR/unknown 25842 27393 5.9 + 10968 Fvb6-4:17811941..17814568_LTR 13 | Fvb1-1 RepeatMasker LTR/Gypsy 27398 27999 19.6 + 1680 Fvb4-2:14377692..14383914_LTR 14 | Fvb1-1 RepeatMasker LTR/Gypsy 28097 28708 21.1 + 1651 Fvb4-2:14377692..14383914_LTR 15 | Fvb1-1 RepeatMasker LTR/unknown 28709 29048 4.4 - 2681 Fvb6-3:14035592..14036822_LTR 16 | Fvb1-1 RepeatMasker LTR/unknown 29051 30034 5.9 - 7242 Fvb7-3:21810512..21811992_LTR 17 | Fvb1-1 RepeatMasker LTR/unknown 30038 30447 20.0 + 1455 Fvb1-2:27872851..27875385_LTR 18 | Fvb1-1 RepeatMasker LTR/Gypsy 30440 31593 15.4 - 5517 Fvb1-1:1901529..1903797_LTR 19 | Fvb1-1 RepeatMasker LTR/Gypsy 30986 31928 12.8 + 3981 Fvb1-2:23736536..23738711_LTR 20 | Fvb1-1 RepeatMasker LTR/Gypsy 32011 33581 7.7 + 11550 Fvb6-4:12909946..12918059_INT-int 21 | Fvb1-1 RepeatMasker LTR/unknown 33582 33661 11.2 - 524 Fvb4-1:5166560..5169132_LTR 22 | Fvb1-1 RepeatMasker LTR/unknown 33646 33720 17.3 + 409 Fvb1-3:21882267..21885047_LTR 23 | Fvb1-1 RepeatMasker LTR/Gypsy 33721 34100 11.8 + 2503 Fvb4-2:19625958..19630035_INT-int 24 | Fvb1-1 RepeatMasker LTR/Gypsy 34101 36264 20.8 + 7112 Fvb3-1:4132469..4134698_LTR 25 | Fvb1-1 RepeatMasker LTR/Gypsy 36265 38219 20.9 + 6733 Fvb5-1:12085737..12087980_LTR 26 | Fvb1-1 RepeatMasker LTR/Gypsy 38514 40380 19.2 + 6488 Fvb6-2:9966870..9969479_LTR 27 | 
Fvb1-1 RepeatMasker LTR/unknown 40383 41342 5.3 - 7215 Fvb1-2:20827613..20830221_LTR 28 | Fvb1-1 RepeatMasker LTR/Gypsy 41343 41699 12.5 - 7196 Fvb1-1:1901529..1903797_LTR 29 | Fvb1-1 RepeatMasker LTR/Gypsy 41469 42958 8.3 + 9764 Fvb1-4:19462751..19465717_LTR 30 | Fvb1-1 RepeatMasker LTR/unknown 42959 45118 8.4 + 14420 Fvb6-4:10839685..10850190_INT-int 31 | Fvb1-1 RepeatMasker LTR/unknown 45111 45191 3.7 + 522 Fvb4-3:22936782..22939826_LTR 32 | Fvb1-1 RepeatMasker LTR/unknown 45192 46897 2.9 + 13508 Fvb1-2:20527699..20531500_INT-int 33 | Fvb1-1 RepeatMasker LTR/unknown 46898 46996 2.0 - 816 Fvb7-1:9481811..9484274_LTR 34 | Fvb1-1 RepeatMasker LTR/Gypsy 47028 47165 21.3 - 248 Fvb6-2:18470852..18482588_INT-int 35 | Fvb1-1 RepeatMasker LTR/unknown 47817 47844 21.9 + 503 Fvb5-3:23020132..23022075_LTR 36 | Fvb1-1 RepeatMasker LTR/unknown 47845 49893 24.3 - 2950 Fvb1-3:25198782..25200749_LTR 37 | Fvb1-1 RepeatMasker LTR/Gypsy 49946 50601 24.6 + 1906 Fvb1-1:5519794..5522056_LTR 38 | Fvb1-1 RepeatMasker LTR/unknown 50156 50664 29.4 + 935 Fvb1-2:20525092..20527698_LTR 39 | Fvb1-1 RepeatMasker LTR/unknown 50657 51121 12.8 + 2484 Fvb1-2:20525092..20527698_LTR 40 | Fvb1-1 RepeatMasker LTR/unknown 51147 51699 19.2 + 1656 Fvb4-3:22936782..22939826_LTR 41 | Fvb1-1 RepeatMasker LTR/unknown 51286 51946 17.5 - 2412 Fvb1-2:20827613..20830221_LTR 42 | Fvb1-1 RepeatMasker LTR/unknown 51947 52077 23.1 + 450 Fvb5-3:23020132..23022075_LTR 43 | Fvb1-1 RepeatMasker LTR/Gypsy 52286 54424 8.6 + 13432 Fvb4-4:12509905..12512100_LTR 44 | Fvb1-1 RepeatMasker LTR/Gypsy 55202 55513 14.0 - 1651 Fvb4-3:23640595..23654748_INT-int 45 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 55515 56340 4.8 + 3890 family-111 46 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 56341 59966 3.0 + 29386 family-2766 47 | Fvb1-1 RepeatMasker LTR/Copia 59974 60078 13.5 + 272 Fvb4-4:16016909..16022681_INT-int 48 | Fvb1-1 RepeatMasker LTR/Gypsy 59992 60163 4.7 - 1301 Fvb5-4:23958992..23969121_INT-int 49 | Fvb1-1 RepeatMasker LTR/unknown 60093 
def import_genes(genes_input_path, logger):
    """Import genes file.

    Reads a GFF-style gene annotation, keeps only 'gene' rows on chr7 and
    chr13, and returns a frame indexed by gene name.

    Args:
        genes_input_path (str): path to the gene annotation file

        logger (logging.Logger): logger used to report null rows

    Returns:
        gene_data (pandas.core.DataFrame): cleaned gene annotation indexed
            by Gene_Name with Chromosome, Feature, Start, Stop, Strand, and
            Length columns
    """

    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Frame",
        "FullName",
    ]

    col_to_use = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Strand",
        "FullName",
    ]

    gene_data = pd.read_csv(
        genes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        dtype={
            "Stop": "float64",
            "Start": "float64",
            "Chromosome": str,
            "Strand": str,
            "FullName": str,  # NB bugfix: was misspelled 'Fullname', which
            # read_csv silently ignores, so the dtype never applied
            "Feature": str,
            "Software": str,
        },
        comment="#",
    )

    # rows in annotation
    gene_data = gene_data[gene_data.Feature == "gene"]  # drop non-gene rows
    gene_data["Gene_Name"] = gene_data["FullName"].str.extract(r";gene_name=(.*?);")

    # NOTE
    gene_data.drop_duplicates(subset=["Gene_Name"], keep=False, inplace=True)
    # Drop duplicate gene names for the human data set, will add explicit
    # function to fix duplicate gene names in future so that the code doesn't
    # crash (Crashes TE_Density due to shape error). TODO add error handling
    # and elegant renaming function to fix duplicate gene names, not relevant
    # to this example though

    gene_data = gene_data.drop(columns=["FullName", "Software"])

    gene_data.Strand = gene_data.Strand.astype(str)

    gene_data["Length"] = gene_data.Stop - gene_data.Start + 1

    # MAGIC I only want the 7th and 13th chromosome
    chromosomes_i_want = ["chr7", "chr13"]
    gene_data = gene_data.loc[gene_data["Chromosome"].isin(chromosomes_i_want)]

    gene_data.sort_values(by=["Chromosome", "Start"], inplace=True)
    check_nulls(gene_data, logger)
    gene_data = drop_nulls(gene_data, logger)
    gene_data.set_index("Gene_Name", inplace=True)

    return gene_data
#!/usr/bin/env/python
# NOTE(review): shebang is malformed ('env/python' should be 'env python');
# harmless when run as 'python find_abnormal_genes.py' -- confirm intent

"""
Identify genes with abnormally high or low upstream LTR density.

Selects O. sativa genes at or above the 99th percentile of LTR density in
the 1KB upstream window, plus an equally sized random sample of genes
passing the 1st-percentile cutoff, and writes both gene lists to disk.
"""

__author__ = "Scott Teresi"

import argparse
import os
import logging
import coloredlogs
import numpy as np
import pandas as pd

from transposon.gene_data import GeneData
from transposon.density_data import DensityData
from transposon.import_filtered_genes import import_filtered_genes


if __name__ == "__main__":
    path_main = os.path.abspath(__file__)
    dir_main = os.path.dirname(path_main)
    output_default = os.path.abspath(os.path.join(dir_main, "../", "results/graphs"))
    parser = argparse.ArgumentParser(description="generate graphs")

    # NOTE(review): help string says 'Arabidopsis' but this script reads
    # O. sativa data everywhere else -- looks like a copy-paste slip; confirm
    parser.add_argument(
        "o_sativa_gene_data",
        type=str,
        help="parent path to Arabidopsis' filtered gene data file",
    )

    parser.add_argument(
        "sativa_density_data_dir",
        type=str,
        help="Parent path of folders containing TE Density results",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="set debugging level to DEBUG"
    )
    parser.add_argument(
        "--output_dir",
        "-o",
        type=str,
        default=output_default,
        help="parent directory to output results",
    )
    args = parser.parse_args()
    args.o_sativa_gene_data = os.path.abspath(args.o_sativa_gene_data)
    args.sativa_density_data_dir = os.path.abspath(args.sativa_density_data_dir)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = logging.getLogger(__name__)
    coloredlogs.install(level=log_level)

    # Begin reading files:
    # Get the genes:
    cleaned_genes = import_filtered_genes(args.o_sativa_gene_data, logger)
    # Split the annotation into one sub-frame per chromosome
    gene_dataframe_list = [
        dataframe for k, dataframe in cleaned_genes.groupby("Chromosome")
    ]
    # NB MAGIC get unique chromosome ID
    gene_data_list = [
        GeneData(dataframe, dataframe["Chromosome"].unique()[0])
        for dataframe in gene_dataframe_list
    ]
    # MAGIC 'Sativa_(.*?).h5' is the regex naming this genome's HDF5 files
    processed_dd_data = DensityData.from_list_gene_data_and_hdf5_dir(
        gene_data_list, args.sativa_density_data_dir, "Sativa_(.*?).h5", logger
    )

    # NOTE
    cleaned_genes.reset_index(inplace=True)  # necessary
    # cleaned_genes = cleaned_genes.loc[cleaned_genes["Chromosome"] == "1"]
    # Pair each chromosome's genes with its DensityData instance and record
    # the HDF5 row index of every gene
    to_concat = []
    for chrom, dataframe in cleaned_genes.groupby(["Chromosome"]):
        for processed_dd_datum in processed_dd_data:
            if processed_dd_datum.unique_chromosome_id == chrom:
                x = processed_dd_datum.add_hdf5_indices_to_gene_data(dataframe)
                to_concat.append(x)
    gene_frame_with_indices = pd.concat(to_concat)

    # NOTE
    # Add the LTR 1KB-upstream density values as a column per chromosome
    to_concat = []
    for chrom, dataframe in gene_frame_with_indices.groupby(["Chromosome"]):
        for processed_dd_datum in processed_dd_data:
            if processed_dd_datum.unique_chromosome_id == chrom:
                x = processed_dd_datum.add_te_vals_to_gene_info_pandas(
                    dataframe, "Order", "LTR", "Upstream", 1000
                )
                to_concat.append(x)
    gene_frame_w_ind_te_vals = pd.concat(to_concat)

    # MAGIC the 99th / 1st percentiles of upstream LTR density define 'abnormal'
    upper_cutoff_val = np.percentile(gene_frame_w_ind_te_vals["LTR_1000_Upstream"], 99)
    lower_cutoff_val = np.percentile(gene_frame_w_ind_te_vals["LTR_1000_Upstream"], 1)

    genes_meeting_upper_cutoff = gene_frame_w_ind_te_vals.loc[
        gene_frame_w_ind_te_vals["LTR_1000_Upstream"] >= upper_cutoff_val
    ]["Gene_Name"].to_list()

    upper_cutoff_len = len(genes_meeting_upper_cutoff)

    print(f"Upper Cutoff Length: {upper_cutoff_len}")
    print(f"Upper Cutoff Val: {upper_cutoff_val}")

    print(lower_cutoff_val)
    # NOTE(review): '>=' selects genes ABOVE the 1st percentile, i.e. nearly
    # all genes, not a low-density tail; the comment below suggests this is
    # deliberate, but confirm '<=' was not intended
    genes_meeting_lower_cutoff = gene_frame_w_ind_te_vals.loc[
        gene_frame_w_ind_te_vals["LTR_1000_Upstream"] >= lower_cutoff_val
    ]["Gene_Name"].to_list()
    print(len(genes_meeting_lower_cutoff))  # NOTE this is 37741 genes when you
    # faithfully apply the cutoff

    # Take random sample equal to the length of the gene array of upper values.
    # Have to take a random sample because data not normally distributed and
    # the cutoff value for the 1st percentile is so low you would actually get
    # way too many genes if you actually applied it.
    genes_meeting_lower_cutoff = np.random.choice(
        gene_frame_w_ind_te_vals.loc[
            gene_frame_w_ind_te_vals["LTR_1000_Upstream"] >= lower_cutoff_val
        ]["Gene_Name"].to_list(),
        upper_cutoff_len,
        replace=False,
    )

    # NOTE begin writing all the values
    filename_to_write = os.path.join(
        args.output_dir,
        str("Upper_Sample_LTR_1000_Upstream" + ".tsv"),
    )
    with open(
        filename_to_write,
        "w",
    ) as f_out:

        for gene in genes_meeting_upper_cutoff:
            f_out.write(gene + "\n")
    logger.info("Writing to: %s" % filename_to_write)

    filename_to_write = os.path.join(
        args.output_dir,
        str("Lower_Sample_LTR_1000_Upstream" + ".tsv"),
    )
    with open(
        filename_to_write,
        "w",
    ) as f_out:

        for gene in genes_meeting_lower_cutoff:
            f_out.write(gene + "\n")
    logger.info("Writing to: %s" % filename_to_write)
def te_annot_renamer(transposon_data):
    """Reclassify and filter the raw human TE annotation.

    Reclassifies entries toward Wicker et al. 2007 groupings (Penelope LINEs
    become their own PLE order, the DNA order is renamed TIR) and drops RNA
    genes, ERV superfamilies, low-confidence ('?') calls, and other non-TE
    repeat classes.

    Args:
        transposon_data (pandas.core.DataFrame): TE annotation with 'Order'
            and 'SuperFamily' columns

    Returns:
        transposon_data (pandas.core.DataFrame): the filtered and renamed
            annotation
    """
    # NB report shape and categories before filtering so the user can compare
    # against the summary printed at the end
    print()
    print(transposon_data.shape)
    print(transposon_data["Order"].unique())
    print(transposon_data["SuperFamily"].unique())
    print()

    # Make Penelope its own Order called PLE, make the SuperFamily Penelope
    # Corresponds to Wicker's grouping of Penelope elements
    transposon_data.loc[
        (transposon_data["Order"] == "LINE")
        & (transposon_data["SuperFamily"] == "Penelope"),
        ["Order"],
    ] = "PLE"

    # Drop RNA-gene Orders, low-confidence ('?') Orders, and non-TE repeat
    # classes; none of these are in Wicker's classification.
    # NB bugfix: the original filtered 'rRNA' twice; the chained boolean
    # filters are consolidated into one isin() drop list
    orders_to_drop = [
        "snRNA",
        "tRNA",
        "rRNA",
        "srpRNA",
        "scRNA",
        "LTR?",
        "SINE?",
        "RC?",
        "DNA?",
        "RNA",
        "Low_complexity",
        "RC",
        "Satellite",
        "Simple_repeat",
    ]
    transposon_data = transposon_data[~transposon_data.Order.isin(orders_to_drop)]

    # Remove ERV superfamilies, low-confidence ('?') superfamilies, and
    # elements whose superfamily is just 'DNA'
    superfamilies_to_drop = [
        "ERVL-maLR",
        "ERV1",
        "ERVL",
        "ERVK",
        "ERV1?",
        "ERVL?",
        "PiggyBac?",
        "Gypsy?",
        "TcMar?",
        "hAT?",
        "hAT-Tip100?",
        "DNA",
    ]
    transposon_data = transposon_data[
        ~transposon_data.SuperFamily.isin(superfamilies_to_drop)
    ]

    # Rename DNA order to TIR
    transposon_data.loc[(transposon_data["Order"] == "DNA"), ["Order"]] = "TIR"

    # NB report shape and categories after filtering
    print()
    print(transposon_data.shape)
    print(transposon_data["Order"].unique())
    print(transposon_data["SuperFamily"].unique())
    print()

    return transposon_data
def get_gene_data_as_list(cleaned_genes):
    """
    Split a cleaned gene annotation (as produced by import_filtered_genes)
    into one GeneData object per chromosome.

    Args:
        cleaned_genes (pandas.DataFrame)
            Index:
                Name: Gene_Name, strings of gene names
            Columns:
                Name: Chromosome, object
                Name: Feature, object
                Name: Start, float64
                Name: Stop, float64
                Name: Strand, object
                Name: Length, float64

    Returns:
        list of GeneData: one entry per unique chromosome ID; used to
        initialize the DensityData objects
    """
    per_chromosome = []
    # MAGIC group rows by the Chromosome column; each sub-frame holds the
    # genes of exactly one chromosome, whose unique ID names the GeneData
    for _, chromosome_frame in cleaned_genes.groupby("Chromosome"):
        unique_id = chromosome_frame["Chromosome"].unique()[0]
        per_chromosome.append(GeneData(chromosome_frame, unique_id))
    return per_chromosome
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="start to analyze TE Density data")

    parser.add_argument(
        "cleaned_gene_annotation",
        type=str,
        help="path to your cleaned gene annotation (.tsv)",
    )

    parser.add_argument(
        "density_data_dir",
        type=str,
        help="Parent path of folder containing ONLY the TE Density results",
    )

    parser.add_argument(
        "chromosome_string",
        type=str,
        help="regex for chromosome ID, e.g. 'GenomeName_(.*?).h5'",
    )
    # NOTE users will need to edit this as needed!
    # This is the regular expression that is used to extract the chromosome IDs
    # and initialize the DensityData objects. This is specific to the naming of
    # your chromosomes in the HDF5 files.
    # Please consult the docstring of from_list_gene_data_and_hdf5_dir in
    # density_data.py for more information, typically you will only need to
    # edit the part of the string before the underscore. Here this was specific
    # For example I was working on a genome with the name DN, so my regex rule
    # was "DN_(.*?).h5"

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="set debugging level to DEBUG"
    )
    args = parser.parse_args()
    args.cleaned_gene_annotation = os.path.abspath(args.cleaned_gene_annotation)
    args.density_data_dir = os.path.abspath(args.density_data_dir)

    # NB just for logging arguments to import_filtered command and DensityData
    # initialization
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = logging.getLogger(__name__)
    coloredlogs.install(level=log_level)

    # --------------------------------------------------
    # Read cleaned genes for the given genome as pandas
    cleaned_genes = import_filtered_genes(args.cleaned_gene_annotation, logger)

    # Get list of GeneData for each genome to enable initialization of
    # DensityData
    genedata_list = get_gene_data_as_list(cleaned_genes)

    # Initialize DensityData for each genome
    # NOTE this object is a list of DensityData instances
    processed_dd_data = DensityData.from_list_gene_data_and_hdf5_dir(
        genedata_list, args.density_data_dir, args.chromosome_string, logger
    )

    # NOTE, this adds the indices of the genes in the HDF5 datasets to a pandas
    # dataframe, this is used later on to access the density data for each gene
    # NB bugfix: this call was previously made twice with identical arguments;
    # the first result ('gene_frame_with_indices') was never used, so the
    # duplicate call has been removed
    gene_frame_with_hdf5_indices = add_hdf5_indices_to_gene_data_from_list_hdf5(
        cleaned_genes, processed_dd_data
    )

    # NOTE, now we can add columns to the pandas dataframe which are the
    # density values for a specific TE type and window combo. This right here
    # adds a column called "LTR_500_Upstream" to the pandas dataframe, this
    # column represents the TE density values for each gene, for the 500 bp
    # upstream window. 500 bp window is used here because I use a smaller
    # testing set of windows for development.

    # NOTE users can loop this over a list of TE types and window sizes if they
    # want
    gene_frame_with_te_vals_of_interest = (
        add_te_vals_to_gene_info_pandas_from_list_hdf5(
            gene_frame_with_hdf5_indices,
            processed_dd_data,
            "Order",
            "LTR",
            "Upstream",
            500,
        )
    )
    print(gene_frame_with_te_vals_of_interest)
def import_genes(genes_input_path, logger):
    """Import genes file.

    Args:
        genes_input_path (str): path to the gene annotation file

        logger (logging.Logger): logger used to report null rows

    Returns:
        gene_pandaframe (Pandas.Data.Frame): A pandas dataframe of a filtered
        GFF file containing the information needed for the TE Density pipeline.
    """

    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Frame",
        "FullName",
    ]

    col_to_use = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Strand",
        "FullName",
    ]

    gene_pandaframe = pd.read_csv(
        genes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        dtype={
            "Stop": "float64",
            "Start": "float64",
            "Chromosome": str,
            "Strand": str,
            "FullName": str,  # NB bugfix: was misspelled 'Fullname', which
            # read_csv silently ignores, so the dtype never applied
            "Feature": str,
            "Software": str,
        },
        comment="#",
    )

    # rows in annotation
    gene_pandaframe = gene_pandaframe[
        gene_pandaframe.Feature == "gene"
    ]  # drop non-gene rows

    gene_pandaframe["Gene_Name"] = gene_pandaframe["FullName"].str.extract(
        r"gene_id=(.*?);"
    )
    gene_pandaframe = gene_pandaframe.drop(columns=["FullName", "Software"])
    gene_pandaframe["Length"] = gene_pandaframe.Stop - gene_pandaframe.Start + 1

    # MAGIC I only want the first 12 chromosomes
    chromosomes_i_want = [str(i) for i in range(1, 12 + 1)]  # MAGIC plus 1 bc range
    # NB, chromosomes_i_want must be string
    gene_pandaframe = gene_pandaframe.loc[
        gene_pandaframe["Chromosome"].isin(chromosomes_i_want)
    ]
    # NB sort once, after filtering; boolean .loc filtering preserves row
    # order, so the pre-filter sort in the original was redundant
    gene_pandaframe.sort_values(by=["Chromosome", "Start"], inplace=True)

    check_nulls(gene_pandaframe, logger)
    gene_pandaframe = drop_nulls(gene_pandaframe, logger)
    # NB
    # Throwing away 1 gene because its name is ridiculous and difficult to
    # import because of extraneous strings in the entry
    # There are '#' characters in the last column of the GFF for a minority of
    # genes in both genomes. Will not be modifying the 'comment=#' of the read
    # command because that will remove the easy ability to remove the comment
    # rows from the annotation. A fix could be performed by removing the #
    # characters through another method, but I will just remove the offending
    # genes because they aren't central to the example.

    # Set the gene name as the index
    gene_pandaframe.set_index("Gene_Name", inplace=True)
    return gene_pandaframe
os.path.abspath(args.gene_input_file) 169 | args.output_dir = os.path.abspath(args.output_dir) 170 | 171 | log_level = logging.DEBUG if args.verbose else logging.INFO 172 | logger = logging.getLogger(__name__) 173 | coloredlogs.install(level=log_level) 174 | 175 | # Execute 176 | cleaned_genes = import_genes(args.gene_input_file, logger) 177 | write_cleaned_genes(cleaned_genes, args.output_dir, args.gene_input_file, logger) 178 | -------------------------------------------------------------------------------- /tests/unit/test_OverlapData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test OverlapData. 5 | 6 | OverlapData can be used in two modes: outut or input file, SEE `from_param`, `from_file`. 7 | Tests are grouped accordingly. 8 | """ 9 | 10 | __author__ = "Michael Teresi" 11 | 12 | import logging 13 | import os 14 | import pytest 15 | import tempfile 16 | 17 | import coloredlogs 18 | import numpy as np 19 | import pandas as pd 20 | 21 | from transposon.gene_data import GeneData 22 | from transposon.overlap import OverlapData 23 | 24 | N_TRANSPOSONS = 4 25 | WINDOWS = [10, 20] 26 | LOGGER = logging.getLogger(__name__) 27 | coloredlogs.install(level=logging.DEBUG) 28 | 29 | 30 | @pytest.fixture 31 | def gene_data(): 32 | """Default GeneData instance.""" 33 | 34 | return GeneData.mock() 35 | 36 | @pytest.fixture 37 | def temp_dir(): 38 | """Temporary directory.""" 39 | 40 | with tempfile.TemporaryDirectory() as temp_dir: 41 | yield temp_dir 42 | 43 | 44 | @pytest.fixture 45 | def temp_file(): 46 | """Temporary directory.""" 47 | 48 | with tempfile.NamedTemporaryFile(suffix="."+OverlapData.EXT) as temp_file: 49 | yield temp_file.name 50 | 51 | @pytest.fixture 52 | def default_data_out(temp_file): 53 | """Return default output OverlapData instance.""" 54 | 55 | return OverlapData.from_param( 56 | GeneData.mock(), N_TRANSPOSONS, WINDOWS, temp_file, logger=LOGGER 57 | ) 58 | 59 | 
@pytest.fixture 60 | def active_output(default_data_out): 61 | """Default OverlapData instance for writing.""" 62 | 63 | with default_data_out as active_output: 64 | yield active_output 65 | 66 | @pytest.fixture 67 | def active_input(default_data_out): 68 | """Default OverlapData instance for reading.""" 69 | 70 | filepath = None 71 | with default_data_out as io: 72 | filepath = io.filepath 73 | with OverlapData.from_file(filepath) as io: 74 | yield io 75 | 76 | @pytest.fixture 77 | def serialized_deserialized(default_data_out): 78 | """Yield an overlap written to disk and one read from the first.""" 79 | 80 | filepath = None 81 | with default_data_out as output: 82 | # MAGIC NUMBER dummy data 83 | output.left[:] = np.ones(output.left.shape) * 2 84 | output.right[:] = np.ones(output.right.shape) * 3 85 | output.intra[:] = np.ones(output.intra.shape) * 4 86 | output._h5_file.flush() 87 | filepath = output.filepath 88 | with OverlapData.from_file(filepath) as input: 89 | yield (input, output) 90 | 91 | def test_from_param_raise(gene_data, temp_file): 92 | """Does the from param factory raise?""" 93 | 94 | OverlapData.from_param(gene_data, N_TRANSPOSONS, WINDOWS, temp_file, logger=LOGGER) 95 | 96 | def test_from_param_raise_enter_exit(active_output): 97 | """Does the context manager raise for an output file?""" 98 | 99 | pass 100 | 101 | def test_open_dispatch_bad(temp_dir): 102 | """Does the open dispatch raise on an invalid config?""" 103 | 104 | class DummyClass(): 105 | pass 106 | od = OverlapData(DummyClass()) 107 | with pytest.raises(TypeError) as excinfo: 108 | od._open_dispatcher() 109 | 110 | def test_open_dispatch_sink(active_output): 111 | """Does the open dispatch create a file?""" 112 | 113 | assert os.path.isfile(active_output._h5_file.filename) 114 | 115 | def _test_open_dispatch_source(temp_dir): 116 | """Does the open dispatch call the source initializer?""" 117 | 118 | raise NotImplementedError() 119 | 120 | def 
test_from_file_raise_valid(default_data_out): 121 | """Does the from file factory raise for valid data?""" 122 | 123 | filepath = None 124 | with default_data_out as io: 125 | filepath = io.filepath 126 | overlap_data = OverlapData.from_file(filepath, LOGGER) 127 | 128 | def test_from_file_raise(temp_dir): 129 | """Does the from file factory raise for invalid data?""" 130 | 131 | with pytest.raises(ValueError) as excinfo: 132 | overlap_data = OverlapData.from_file('not a file') 133 | 134 | def test_from_file_raise_enter_exit(default_data_out): 135 | """Does the context manager raise?""" 136 | 137 | filepath = None 138 | with default_data_out as io: 139 | filepath = io.filepath 140 | with OverlapData.from_file(filepath) as io: 141 | assert io.filepath == filepath 142 | 143 | def test_left_right_shape(active_output): 144 | """Do the shapes match for left / right overlap?""" 145 | 146 | assert np.all(active_output.left.shape == active_output.right.shape) 147 | 148 | def test_io_windows(serialized_deserialized): 149 | """Can it serialize / deserialize the windows?""" 150 | 151 | input, output = serialized_deserialized 152 | assert input.windows == output.windows 153 | 154 | def test_io_genes(serialized_deserialized): 155 | """Can it serialize / deserialize the gene names?""" 156 | 157 | input, output = serialized_deserialized 158 | assert input.gene_names == output.gene_names 159 | 160 | def test_io_chromosome_id(serialized_deserialized): 161 | """Can it serialize / deserialize the chromosome id?""" 162 | 163 | input, output = serialized_deserialized 164 | assert input.chromosome_id == output.chromosome_id 165 | 166 | def test_io_chromosome_id(serialized_deserialized): 167 | """Can it serialize / deserialize the chromosome id?""" 168 | 169 | input, output = serialized_deserialized 170 | assert input.genome_id == output.genome_id 171 | 172 | def test_io_left(serialized_deserialized): 173 | """Can it serialize / deserialize the left overlap""" 174 | 175 | input, output = 
serialized_deserialized 176 | assert np.all(input.left == output.left) 177 | 178 | def test_io_intra(serialized_deserialized): 179 | """Can it serialize / deserialize the intra overlap?""" 180 | 181 | input, output = serialized_deserialized 182 | assert np.all(input.intra == output.intra) 183 | 184 | def test_io_right(serialized_deserialized): 185 | """Can it serialize / deserialize the right overlap?""" 186 | 187 | input, output = serialized_deserialized 188 | assert np.all(input.right == output.right) 189 | 190 | # TODO test slicing 191 | 192 | if __name__ == "__main__": 193 | pytest.main(['-s', __file__]) # for convenience 194 | -------------------------------------------------------------------------------- /tests/unit/test_DensityData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | 3 | """ 4 | Unit test DensityData 5 | """ 6 | 7 | __author__ = "Scott Teresi" 8 | 9 | import h5py 10 | import numpy as np 11 | import pytest 12 | import logging 13 | import coloredlogs 14 | 15 | from transposon import write_vlen_str_h5py 16 | from transposon.density_data import DensityData 17 | from transposon.import_filtered_genes import import_filtered_genes 18 | from transposon.gene_data import GeneData 19 | 20 | LOGGER = logging.getLogger(__name__) 21 | coloredlogs.install(level=logging.DEBUG) 22 | TEST_GENE_FILE = "tests/input_data/Test_Genes_DensityData.tsv" 23 | 24 | 25 | @pytest.fixture 26 | def chromosomes(): 27 | """Return a list of chromosomes for h5py file""" 28 | return ["TestChrom1"] 29 | 30 | 31 | @pytest.fixture 32 | def gene_names(): 33 | """Yield a list of genes""" 34 | gene_names = [ 35 | "dummy1", 36 | "dummy2", 37 | "dummy3", 38 | "dummy4", 39 | "dummy5", 40 | "dummy6", 41 | "dummy7", 42 | "dummy8", 43 | ] 44 | return gene_names 45 | 46 | 47 | @pytest.fixture 48 | def dummy_gene_data(): 49 | """Yield a GeneData for testing""" 50 | gene_pandas = import_filtered_genes(TEST_GENE_FILE, LOGGER) 51 | 
return GeneData(gene_pandas, "dummy_genome_name", LOGGER) 52 | 53 | 54 | @pytest.fixture 55 | def order_names(): 56 | """Return a list of order types""" 57 | order_names = ["LTR", "TIR"] 58 | return order_names 59 | 60 | 61 | @pytest.fixture 62 | def super_names(): 63 | """Return a list of super names""" 64 | super_names = ["Copia", "Gypsy", "HAT"] 65 | return super_names 66 | 67 | 68 | @pytest.fixture 69 | def windows(): 70 | """Return a list of window values""" 71 | windows = ["500", "1000", "1500"] 72 | return windows 73 | 74 | 75 | @pytest.fixture 76 | def total_orders(order_names): 77 | """Return an integer which is the total number of orders""" 78 | return sum(1 for order in order_names) 79 | 80 | 81 | @pytest.fixture 82 | def total_windows(windows): 83 | """Return an integer which is the total number of windows""" 84 | return sum(1 for window in windows) 85 | 86 | 87 | @pytest.fixture 88 | def total_genes(gene_names): 89 | """Return an integer which is the total number of genes""" 90 | return sum(1 for gene in gene_names) 91 | 92 | 93 | @pytest.fixture 94 | def rho_o_left(total_orders, total_windows, total_genes): 95 | """Return an array of order left values""" 96 | # NOTE currently it will perform np.arange(48) and reshape to (2,3,8) 97 | matrix_num = total_orders * total_windows * total_genes 98 | arr = np.arange(matrix_num).reshape((total_orders, total_windows, total_genes)) 99 | return arr 100 | 101 | 102 | @pytest.fixture 103 | def rho_o_intra(total_orders, total_genes): 104 | """Return an array of order left values""" 105 | # NOTE currently it will perform np.arange(48) 106 | matrix_num = total_orders * total_genes 107 | arr = np.arange(matrix_num).reshape((total_orders, 1, total_genes)) 108 | return arr 109 | 110 | 111 | @pytest.fixture 112 | def rho_o_right(total_orders, total_windows, total_genes): 113 | """Return an array of order left values""" 114 | # NOTE currently it will perform np.arange(48) *2 starting from 48 115 | # so that we may 
differentiate from rho o left, and reshape to (2,3,8) 116 | # All this does is make the values different, not the shape 117 | matrix_num = total_orders * total_windows * total_genes * 2 118 | arr = np.arange(48, matrix_num).reshape((total_orders, total_windows, total_genes)) 119 | return arr 120 | 121 | 122 | @pytest.fixture 123 | def density_data_test_obj_swap_vals( 124 | chromosomes, 125 | dummy_gene_data, 126 | order_names, 127 | super_names, 128 | windows, 129 | rho_o_left, 130 | rho_o_intra, 131 | rho_o_right, 132 | ): 133 | """Create a test object for DensityData, reads from file""" 134 | # TODO set path to be a variable not hard-coded, plus it repeats further 135 | # down 136 | f = h5py.File( 137 | "tests/input_data/test_swap_file.h5", 138 | "w", 139 | ) 140 | gene_names = list(dummy_gene_data.names) 141 | write_vlen_str_h5py(f, chromosomes, "CHROMOSOME_ID") 142 | write_vlen_str_h5py(f, gene_names, "GENE_NAMES") 143 | write_vlen_str_h5py(f, order_names, "ORDER_NAMES") 144 | write_vlen_str_h5py(f, super_names, "SUPERFAMILY_NAMES") 145 | write_vlen_str_h5py(f, windows, "WINDOWS") 146 | 147 | dset = f.create_dataset("RHO_ORDERS_LEFT", data=rho_o_left) 148 | dset = f.create_dataset("RHO_ORDERS_INTRA", data=rho_o_intra) 149 | dset = f.create_dataset("RHO_ORDERS_RIGHT", data=rho_o_right) 150 | 151 | # NB just re-doing the values for the supers because not testing supers 152 | dset = f.create_dataset("RHO_SUPERFAMILIES_LEFT", data=rho_o_left) 153 | dset = f.create_dataset("RHO_SUPERFAMILIES_INTRA", data=rho_o_intra) 154 | f.create_dataset("RHO_SUPERFAMILIES_RIGHT", data=rho_o_right) 155 | f.close() 156 | return DensityData( 157 | "tests/input_data/test_swap_file.h5", 158 | dummy_gene_data, 159 | LOGGER, 160 | sense_swap=False, 161 | ) 162 | 163 | 164 | def test_swap_density_vals(density_data_test_obj_swap_vals): 165 | """Test whether or not left and right density values are swapped 166 | correctly""" 167 | # Set values for easy testing and checking 168 | # Shape of 
left/right orders is (no. of TE types, window, no. genes). 169 | # Here we do index 1, which corresponds to Gene2 170 | 171 | density_data_test_obj_swap_vals.left_orders[:, :, 1] = 100 172 | density_data_test_obj_swap_vals.right_orders[:, :, 1] = 200 173 | 174 | # Call the value swapper, manually, because it was set to NOT do it upon 175 | # initialization 176 | density_data_test_obj_swap_vals._swap_strand_vals(["dummy2"]) 177 | 178 | # Check the values 179 | assert np.array_equal( 180 | density_data_test_obj_swap_vals.left_orders[:, :, 1], np.full((2, 3), 200) 181 | ) 182 | assert np.array_equal( 183 | density_data_test_obj_swap_vals.right_orders[:, :, 1], np.full((2, 3), 100) 184 | ) 185 | 186 | 187 | if __name__ == "__main__": 188 | pytest.main(["-s", __file__]) # for convenience 189 | -------------------------------------------------------------------------------- /transposon/gene_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Wrappers for input data, multiple genes. 5 | 6 | Used to provide a common interface and fast calculations with numpy. 7 | """ 8 | 9 | __author__ = "Michael Teresi, Scott Teresi" 10 | 11 | import logging 12 | import numpy as np 13 | import pandas as pd 14 | 15 | from transposon.gene_datum import GeneDatum 16 | 17 | 18 | class GeneData(object): 19 | """Wraps a data frame containing many genes. 20 | Provides an interface, attribute access, and to/from disk functionality. 21 | 22 | Note the numpy views are not necessarily a no-copy (SEE pandas.DataFrame.to_numpy). 23 | 24 | Expects certain column identifiers (SEE self.__init__). 25 | GeneData subclasses should conform to these column names or redefine the properties. 26 | """ 27 | 28 | def __init__(self, gene_dataframe, genome_id, logger=None): 29 | """Initialize. 30 | 31 | Args: 32 | gene_dataframe (DataFrame): gene data frame. 33 | genome_id (str): a string of the genome name. 
34 | """ 35 | 36 | self._logger = logger or logging.getLogger(__name__) 37 | self.data_frame = gene_dataframe.copy(deep=True) 38 | self._names = self.data_frame.index # names of genes list(str) 39 | self.starts = self.data_frame.Start.to_numpy(copy=False) 40 | self.stops = self.data_frame.Stop.to_numpy(copy=False) 41 | self.lengths = self.data_frame.Length.to_numpy(copy=False) 42 | self.chromosomes = self.data_frame.Chromosome.to_numpy(copy=False) 43 | self.genome_id = genome_id 44 | self.add_genome_id() 45 | 46 | @classmethod 47 | def mock( 48 | cls, 49 | start_stop=np.array([[0, 9], [10, 19], [20, 29]]), 50 | genome_id="fake_genome_id", 51 | ): 52 | """Mocked data for testing. 53 | 54 | Args: 55 | start_stop (numpy.array): N gene x (start_idx, stop_idx) 56 | """ 57 | 58 | n_genes = start_stop.shape[0] 59 | data = [] 60 | for gi in range(n_genes): 61 | g0 = start_stop[gi, 0] 62 | g1 = start_stop[gi, 1] 63 | gL = g1 - g0 + 1 64 | name = "gene_{}".format(gi) 65 | chromosome = genome_id 66 | datum = [name, g0, g1, gL, chromosome] 67 | data.append(datum) 68 | 69 | col_names = ["Gene_Name", "Start", "Stop", "Length", "Chromosome"] 70 | frame = pd.DataFrame(data, columns=col_names) 71 | frame.set_index("Gene_Name", inplace=True) 72 | return GeneData(frame, genome_id) 73 | 74 | @classmethod 75 | def mock_v2( 76 | cls, 77 | start_stop=np.array([[500, 1000], [800, 1500], [1600, 2000]]), 78 | chromosome_ids=["Chrom_1", "Chrom_2", "Chrom_3"], 79 | genome_id="fake_genome_id", 80 | ): 81 | """Mocked data for testing. TODO refactor with main mock. 
82 | 83 | Args: 84 | start_stop (numpy.array): N gene x (start_idx, stop_idx) 85 | chromosome_id (list): List of string, len(N gene) 86 | """ 87 | 88 | n_genes = start_stop.shape[0] 89 | data = [] 90 | for gi in range(n_genes): 91 | g0 = start_stop[gi, 0] 92 | g1 = start_stop[gi, 1] 93 | gL = g1 - g0 + 1 94 | name = "gene_{}".format(gi) 95 | chromosome_id = chromosome_ids[gi] 96 | datum = [name, g0, g1, gL, chromosome_id] 97 | data.append(datum) 98 | 99 | col_names = ["Gene_Name", "Start", "Stop", "Length", "Chromosome"] 100 | frame = pd.DataFrame(data, columns=col_names) 101 | frame.set_index("Gene_Name", inplace=True) 102 | return GeneData(frame, genome_id) 103 | 104 | def write(self, filename): 105 | """Write a Pandaframe to disk. 106 | 107 | Args: 108 | filename (str): a string of the filename to write. 109 | 110 | """ 111 | # Begin refactor 112 | self.data_frame.to_csv(filename, sep="\t", header=True, index=True) 113 | 114 | @classmethod 115 | def read(cls, filename): 116 | """Read from disk. Returns a wrapped Pandaframe from an hdf5 file 117 | 118 | Args: 119 | filename (str): a string of the filename to write. 120 | """ 121 | # NOTE this is a little duplicate with import_filtered_genes 122 | # NOTE I don't have logger obj here, so talk to Mike if we just want to have 123 | # duplicate code. 
124 | data_frame = pd.read_csv( 125 | filename, 126 | header="infer", 127 | sep="\t", 128 | index_col="Gene_Name", 129 | dtype={ 130 | "Start": "float64", 131 | "Stop": "float64", 132 | "Length": "float64", 133 | "Chromosome": str, 134 | "Strand": str, 135 | "Feature": str, 136 | "Genome_ID": str, 137 | }, 138 | ) 139 | data_frame.sort_values(by=["Chromosome", "Start"], inplace=True) 140 | genome_id_list = data_frame["Genome_ID"].unique().tolist() 141 | if not genome_id_list: 142 | raise RuntimeError("column 'Genome_ID' is empty") 143 | elif len(genome_id_list) > 1: 144 | raise RuntimeError("Genome IDs are are not unique: %s" % genome_id_list) 145 | else: 146 | genome_id = genome_id_list[0] # MAGIC NUMBER list to string 147 | 148 | data_frame.drop(columns=["Genome_ID"], inplace=True) # NOTE, have to do 149 | # this to avoid it getting the column twice, perhaps refactor? 150 | 151 | new_instance = cls(data_frame, genome_id) 152 | return new_instance 153 | 154 | def get_gene(self, gene_id): 155 | """Return a GeneDatum for the gene identifier.""" 156 | 157 | return GeneDatum(self.data_frame, gene_id) 158 | 159 | def add_genome_id(self): 160 | """Add the genome_id as an extra column to the gene_dataframe""" 161 | self.data_frame["Genome_ID"] = self.genome_id 162 | 163 | @property 164 | def names(self): 165 | """Yields the names for each gene.""" 166 | 167 | return (name for name in self._names) 168 | 169 | @property 170 | def chromosome_unique_id(self): 171 | """Unique chromosome identifier for all the genes available. 172 | 173 | This will raise if the genes are not from the same chromosome, 174 | for example you you didn't split the dataset wrt this data. 175 | 176 | Returns: 177 | str: the unique identifier. 178 | Raises: 179 | RuntimeError: if multiple chromosomes are in the data frame (i.e. no unique). 
180 | """ 181 | 182 | chromosome_list = self.data_frame.Chromosome.unique().tolist() 183 | if not chromosome_list: 184 | raise RuntimeError("column 'Chromosome' is empty") 185 | elif len(chromosome_list) > 1: 186 | raise RuntimeError("chromosomes are not unique: %s" % chromosome_list) 187 | else: 188 | return chromosome_list[0] # MAGIC NUMBER list to string 189 | 190 | def __repr__(self): 191 | """String representation for developer.""" 192 | 193 | return "GeneData{}".format(self.data_frame) 194 | -------------------------------------------------------------------------------- /transposon/verify_cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Contains methods for verifying various cached input data. 5 | """ 6 | 7 | __author__ = "Scott Teresi" 8 | 9 | import os 10 | import sys 11 | import pandas as pd 12 | from transposon.revise_annotation import ReviseAnno 13 | from transposon.import_filtered_genes import import_filtered_genes 14 | from transposon.import_filtered_TEs import import_filtered_TEs 15 | 16 | 17 | def verify_chromosome_h5_cache( 18 | gene_data_obj, 19 | te_data_obj, 20 | g_filepath, 21 | t_filepath, 22 | reset_h5, # TODO edit this later 23 | cache_location, 24 | genes_input_file, 25 | tes_input_file, 26 | chrom_id, 27 | logger, 28 | ): 29 | """Determine whether or not previously saved gene_data and TransposonData 30 | exist in .tsv format. Each file represents either gene_data or 31 | TransposonData for one chromosome at a time. Save/update files as 32 | necessary. 33 | 34 | When are the tsv cache files written? 35 | 1. Files will be written if there are no current corresponding files 36 | saved on disk. 37 | 2. If a command-line option is passed to density.py to re-write the 38 | files, this option defaults to NOT re-write. 39 | # TODO check 40 | 3. If enough time has passed between the creation of the H5 file and the 41 | current run-time of the program. 
TODO talk to Michael more about this. 42 | 43 | Args: 44 | gene_data_obj (gene_data): Instance of gene_data 45 | te_data_obj (TransposonData): Instance of TransposonData 46 | g_filepath (str): The string of the filename in which to save the 47 | gene_data as an H5 file. 48 | t_filepath (str): The string of the filename in which to save the 49 | TransposonData as an H5 file. 50 | reset_h5 (bool): Boolean, whether or not to completely rewrite the 51 | cache of all H5 files. True means that we will rewrite. 52 | cache_location (str): The location (file path) in which to store the 53 | cache gene and TE data files. 54 | genes_input_file (str): The file path of the file that was used to 55 | generate the gene_data instance. 56 | tes_input_file (str): The file path of the file that was used to 57 | generate the TransposonData instance. 58 | chrom_id (str): A string representation of the current chromosome. Used 59 | to name each H5 file. 60 | """ 61 | if reset_h5: 62 | logger.info("overwrite: %s" % g_filepath) 63 | logger.info("overwrite: %s" % t_filepath) 64 | gene_data_obj.write(g_filepath) 65 | te_data_obj.write(t_filepath) 66 | 67 | if os.path.exists(g_filepath) and os.path.exists(t_filepath): 68 | gene_annot_time = os.path.getmtime(genes_input_file) 69 | te_annot_time = os.path.getmtime(tes_input_file) 70 | gene_h5_time = os.path.getmtime(g_filepath) 71 | te_h5_time = os.path.getmtime(t_filepath) 72 | 73 | if (gene_annot_time > gene_h5_time) and (te_annot_time > te_h5_time): 74 | logger.info("cache is too old for chromosome '%s'" % chrom_id) 75 | logger.info("write: %s" % g_filepath) 76 | logger.info("write: %s" % t_filepath) 77 | gene_data_obj.write(g_filepath) 78 | te_data_obj.write(t_filepath) 79 | 80 | elif (gene_annot_time < gene_h5_time) and (te_annot_time < te_h5_time): 81 | # No need to re-write a current cache 82 | return 83 | 84 | elif reset_h5 or (not (os.path.exists(g_filepath) and os.path.exists(t_filepath))): 85 | gene_data_obj.write(g_filepath) 86 | 
te_data_obj.write(t_filepath) 87 | else: 88 | logger.critical( 89 | """During the verification of the H5 cache nothing was 90 | saved because 0 conditions were met.""" 91 | ) 92 | 93 | 94 | def verify_TE_cache(tes_input_file, logger): 95 | """Read a preprocessed/filtered TE annotation file from disk; return a 96 | pandaframe of the file, no modifications are made to the data. 97 | 98 | Args: 99 | tes_input_file (str): A command line argument, this is the location 100 | of the processed TE annotation file. 101 | 102 | Returns: 103 | te_data (pandas.DataFrame): A pandas dataframe of the TE data 104 | """ 105 | logger.info("Reading pre-filtered TE annotation file: %s" % tes_input_file) 106 | te_data = import_filtered_TEs(tes_input_file, logger) 107 | return te_data 108 | 109 | 110 | def verify_gene_cache(genes_input_file, logger): 111 | """Read a preprocessed/filtered gene annotation file from disk; return a 112 | pandaframe of the file, no modifications are made to the data. 113 | 114 | Args: 115 | genes_input_file (str): A command line argument, this is the location 116 | of the processed gene annotation file. 117 | 118 | Returns: 119 | gene_data (pandas.DataFrame): the gene data container 120 | """ 121 | logger.info("Reading pre-filtered gene annotation file: %s" % genes_input_file) 122 | gene_data = import_filtered_genes(genes_input_file, logger) 123 | return gene_data 124 | 125 | 126 | def revise_annotation( 127 | te_data, revise_anno, revised_transposons_loc, revised_cache_loc, logger, genome_id 128 | ): 129 | """Remove overlapping elements of the same type. 130 | 131 | Revises the annotation so that elements of the same type do not overlap at 132 | all. Will essentially merge elements together, elongating them. This is 133 | done so that the mathematics of density make sense. 
You can elect to not 134 | use the revised annotation through a command-line argument to density.py, 135 | however given that TEs often overlap with one another in annotatios (not 136 | just being nested in one another) it can lead to some difficulties in 137 | accurately assessing density and obfuscate the density results. 138 | 139 | Args: 140 | te_data (pandas.core.DataFrame): A PandaFrame of the TE data, 141 | previously imported from raw and filtered or imported from a previously 142 | filtered data file that was saved to disk. 143 | 144 | revise_anno (bool): A boolean of whether or not to use/create a revised 145 | annotation 146 | 147 | revised_transposons (str): A string representing the path of a 148 | previously filtered (cleaned) and revised TE annotation. 149 | 150 | revised_cache_loc (): Directory for output files 151 | 152 | logger (): 153 | 154 | genome_id (str): String of the genome ID 155 | 156 | Returns: 157 | te_data (pandaframe): A pandas dataframe of the revised TE data 158 | """ 159 | 160 | if os.path.exists(revised_transposons_loc) and not revise_anno: 161 | logger.info("load revised TE: %s" % revised_transposons_loc) 162 | te_data = import_filtered_TEs(revised_transposons_loc, logger) 163 | else: 164 | logger.info("creating revised TE dataset...") 165 | logger.info("revising the TE dataset will take a long time!") 166 | # N.B we want higher recursion limit for the code 167 | sys.setrecursionlimit(11 ** 6) 168 | revised_te_data = ReviseAnno( 169 | te_data, revised_transposons_loc, revised_cache_loc, genome_id 170 | ) 171 | revised_te_data.create_superfam() 172 | revised_te_data.create_order() 173 | revised_te_data.create_nameless() 174 | revised_te_data.verify_files() 175 | te_data = revised_te_data.whole_te_annotation 176 | return te_data 177 | --------------------------------------------------------------------------------