├── VERSION ├── transposon ├── utils.py ├── test_utils.py ├── import_filtered_genes.py ├── import_filtered_TEs.py ├── worker.py ├── notes ├── replace_names.py ├── __init__.py ├── gene_data.py └── verify_cache.py ├── examples ├── Human │ ├── list_of_chr13_genes.txt │ ├── list_of_chr7_genes.txt │ ├── src │ │ ├── sync_remote_to_local.sh │ │ ├── sync_local_to_remote.sh │ │ ├── TE_Density_Human.sb │ │ ├── import_human_te_anno.py │ │ ├── retrieve_info_of_genes.py │ │ ├── import_human_gene_anno.py │ │ └── replace_human_TE_names.py │ ├── README.md │ └── Makefile ├── Rice_Synteny │ ├── src │ │ ├── .gitignore │ │ ├── sync_remote_to_local.sh │ │ ├── sync_local_to_remote.sh │ │ ├── Annotate_EDTA_Rice_Sativa.sb │ │ ├── Annotate_EDTA_Rice_Glaberrima.sb │ │ ├── TE_Density_Sativa.sb │ │ ├── TE_Density_Glaberrima.sb │ │ ├── generate_pairs.py │ │ ├── bargraphs.py │ │ ├── fix_cds_names.py │ │ ├── replace_names_rice.py │ │ ├── import_syntelogs.py │ │ ├── fix_fasta_names.py │ │ ├── import_rice_EDTA.py │ │ ├── find_abnormal_genes.py │ │ └── import_rice_gene_anno.py │ ├── README.md │ └── Makefile ├── Arabidopsis │ ├── .gitignore │ ├── src │ │ ├── sync_remote_to_local.sh │ │ ├── sync_local_to_remote.sh │ │ ├── Annotate_EDTA_Arabidopsis_thaliana.sb │ │ ├── TE_Density_Arabidopsis.sb │ │ ├── replace_names_Arabidopsis.py │ │ ├── import_Arabidopsis_EDTA.py │ │ └── import_Arabidopsis_gene_anno.py │ └── Makefile ├── Blueberry_Expression │ ├── src │ │ ├── sync_remote_to_local.sh │ │ ├── sync_local_to_remote.sh │ │ ├── Annotate_TE_Blueberry_EDTA.sb │ │ ├── TE_Density_Blueberry.sb │ │ ├── import_blueberry_gene_anno.py │ │ ├── replace_names_blueberry.py │ │ └── import_blueberry_EDTA.py │ ├── Makefile │ └── EDTA_Blueberry.out ├── README.md └── general_read_density_data.py ├── .flake8 ├── Conditions_1.jpg ├── Conditions_2.jpg ├── .vimrc ├── config ├── test_run_config.ini └── production_run_config.ini ├── tests ├── unit │ ├── pytest.ini │ ├── test_MergeWorker.py │ ├── test_OverlapManager.py │ ├── 
test_data.py │ ├── test_gene_data.py │ ├── test_transposon_data.py │ ├── test_WorkerProcess.py │ ├── test_preprocess.py │ ├── test_import_genes.py │ ├── test_OverlapData.py │ └── test_DensityData.py ├── input_data │ ├── Test_Genes_MergeData.tsv │ ├── Test_SingleC_ConcurrentOverlap_Nameless_Revision.tsv │ ├── Test_SingleC_ConcurrentOverlap_Order_Revision.tsv │ ├── Test_SingleC_ConcurrentOverlap_Superfam_Revision.tsv │ ├── Test_TEs_MergeData.tsv │ ├── Test_Genes_DensityData.tsv │ ├── Test_SingleC_SingleElongate_Nameless_Revision.tsv │ ├── Test_SingleC_SingleElongate_Order_Revision.tsv │ ├── Test_Preprocess_Cleaned_TEs_Unequal_Number_Chrom.tsv │ ├── Test_Preprocess_Cleaned_TEs_Unequal_Chrom_IDs.tsv │ ├── Test_Gene_Anno_Float_Conversion.tsv │ ├── Test_SingleC_MultiElongate_Order_Revision.tsv │ ├── Test_SingleC_MultiElongate_Nameless_Revision.tsv │ ├── Test_Preprocess_Cleaned_Genes.tsv │ ├── Test_SingleC_MultiElongate_Superfam_Revision.tsv │ ├── Test_SingleC_SingleElongate_Superfam_Revision.tsv │ └── Test_Genes_NormMatrix.tsv ├── test_transposon.tsv └── test_transposon_single_chrom.tsv ├── RELEASE ├── .gitignore ├── setup.py ├── requirements.txt ├── CHANGELOG ├── DESIGN └── Makefile /VERSION: -------------------------------------------------------------------------------- 1 | 2.1.3 2 | -------------------------------------------------------------------------------- /transposon/utils.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/Human/list_of_chr13_genes.txt: -------------------------------------------------------------------------------- 1 | BRCA2 2 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | 
-------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 89 3 | exclude = .git,__pycache__ 4 | -------------------------------------------------------------------------------- /examples/Human/list_of_chr7_genes.txt: -------------------------------------------------------------------------------- 1 | GUSB 2 | CFTR 3 | PMS2 4 | NT5C3A 5 | -------------------------------------------------------------------------------- /Conditions_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjteresi/TE_Density/HEAD/Conditions_1.jpg -------------------------------------------------------------------------------- /Conditions_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjteresi/TE_Density/HEAD/Conditions_2.jpg -------------------------------------------------------------------------------- /.vimrc: -------------------------------------------------------------------------------- 1 | let HIGHLIGHT_COLS=90 "overwrite this in local .vimrc for per project setting 2 | -------------------------------------------------------------------------------- /config/test_run_config.ini: -------------------------------------------------------------------------------- 1 | [density_parameters] 2 | first_window_size = 400 3 | window_delta = 200 4 | last_window_size = 400 5 | -------------------------------------------------------------------------------- /config/production_run_config.ini: -------------------------------------------------------------------------------- 1 | [density_parameters] 2 | first_window_size = 500 3 | window_delta = 500 4 | last_window_size = 10000 5 | -------------------------------------------------------------------------------- /examples/Arabidopsis/.gitignore: 
-------------------------------------------------------------------------------- 1 | results/tables/** 2 | results/graphs/** 3 | results/chi_squared/** 4 | results/arrays_for_pat_1KB_up_down_11_29_2021.tar.gz 5 | -------------------------------------------------------------------------------- /tests/unit/pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | [pytest] 3 | filterwarnings= default 4 | ignore:.*is deprecated.*:Warning 5 | error::DeprecationWarning:importlib.* 6 | -------------------------------------------------------------------------------- /examples/Human/src/sync_remote_to_local.sh: -------------------------------------------------------------------------------- 1 | rsync -ave ssh teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Human/ /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Human/ 2 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/sync_remote_to_local.sh: -------------------------------------------------------------------------------- 1 | rsync -ave ssh teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/ /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Rice/ 2 | -------------------------------------------------------------------------------- /RELEASE: -------------------------------------------------------------------------------- 1 | 2.0.0 2 | - Upgrade h5py 2.10 --> 3.7, resulted in incompatibility w/ output H5 files 3 | + string variables now require explicit de/encode 4 | + reading previous output files not supported 5 | 6 | 1.0.0 7 | - initial release 8 | -------------------------------------------------------------------------------- /examples/Arabidopsis/src/sync_remote_to_local.sh: -------------------------------------------------------------------------------- 1 | rsync -ave ssh 
teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Arabidopsis/ /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Arabidopsis/ 2 | -------------------------------------------------------------------------------- /examples/Human/src/sync_local_to_remote.sh: -------------------------------------------------------------------------------- 1 | rsync -h -v -r -P -t -e ssh /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Human --chmod=Dg+s teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/ 2 | -------------------------------------------------------------------------------- /tests/input_data/Test_Genes_MergeData.tsv: -------------------------------------------------------------------------------- 1 | Gene_Name Chromosome Feature Start Stop Strand Length 2 | dummy_gene_1 Fvb1-1 gene 100.0 2300.0 + 2201.0 3 | dummy_gene_2 Fvb1-1 gene 6000.0 7900.0 - 1901.0 4 | dummy_gene_3 Fvb1-1 gene 8000.0 8500.0 - 501.0 5 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/sync_remote_to_local.sh: -------------------------------------------------------------------------------- 1 | rsync -ave ssh teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Blueberry/ /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Blueberry/ 2 | 3 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/sync_local_to_remote.sh: -------------------------------------------------------------------------------- 1 | rsync -h -v -r -P -t -e ssh /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Rice --chmod=Dg+s teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/ 2 | -------------------------------------------------------------------------------- 
/examples/Arabidopsis/src/sync_local_to_remote.sh: -------------------------------------------------------------------------------- 1 | rsync -h -v -r -P -t -e ssh /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Arabidopsis --chmod=Dg+s teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Arabidopsis 2 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/sync_local_to_remote.sh: -------------------------------------------------------------------------------- 1 | rsync -h -v -r -P -t -e ssh /home/scott/Documents/Uni/Research/Projects/TE_Density_Example_Data/Blueberry --chmod=Dg+s teresisc@rsync.hpcc.msu.edu:/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Blueberry/ 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.swp 2 | **/*.swo 3 | **/*.orig 4 | **/*.pyc 5 | tmp 6 | tags 7 | cumtime 8 | .vscode 9 | tests/output_data/* 10 | tests/test_h5_cache_loc/* 11 | tests/input_data/test_swap_file.h5 12 | tests/input_data/test_swap_file_SenseSwapped.HDF5 13 | 14 | /filtered_input_data/revised_input_data/* 15 | !/filtered_input_data/revised_input_data/.gitkeep 16 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_ConcurrentOverlap_Nameless_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3350.0 3957.0 - LTR Gypsy 608.0 4 | Fvb1-1 4209.0 4761.0 - LTR Gypsy 553.0 5 | Fvb1-1 4761.0 5000.0 - DNA TIR 239.0 6 | Fvb1-1 12100.0 12300.0 - TIR hAT 201.0 7 | Fvb1-1 12150.0 12200.0 + DNA MITE 51.0 8 | -------------------------------------------------------------------------------- 
/tests/input_data/Test_SingleC_ConcurrentOverlap_Order_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3350.0 3957.0 - LTR Gypsy 608.0 4 | Fvb1-1 4209.0 4761.0 - LTR Gypsy 553.0 5 | Fvb1-1 4761.0 5000.0 - LTR Copia 239.0 6 | Fvb1-1 12100.0 12300.0 - TIR hAT 201.0 7 | Fvb1-1 12150.0 12200.0 + TIR hAT 51.0 8 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_ConcurrentOverlap_Superfam_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 4 | Fvb1-1 4209.0 4761.0 - LTR Gypsy 553.0 5 | Fvb1-1 4761.0 5000.0 - LTR Gypsy 239.0 6 | Fvb1-1 12100.0 12300.0 - TIR hAT 201.0 7 | Fvb1-1 12150.0 12200.0 + TIR hAT 51.0 8 | -------------------------------------------------------------------------------- /tests/input_data/Test_TEs_MergeData.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350 4100 - Total_TE_Density Total_TE_Density 751 3 | Fvb1-1 4870 5229 - Total_TE_Density Total_TE_Density 360 4 | Fvb1-1 8459 9370 - Total_TE_Density Total_TE_Density 912 5 | Fvb1-1 9556 9677 + Total_TE_Density Total_TE_Density 122 6 | Fvb1-1 9678 11249 - Total_TE_Density Total_TE_Density 1572 7 | Fvb1-1 11250 11469 + Total_TE_Density Total_TE_Density 220 8 | -------------------------------------------------------------------------------- /tests/input_data/Test_Genes_DensityData.tsv: -------------------------------------------------------------------------------- 1 | Gene_Name Chromosome Feature Start Stop Strand Length 2 | dummy1 Fvb1-1 gene 41.0 2396.0 + 2356.0 3 | dummy2 Fvb1-1 gene 5556.0 
7978.0 - 2423.0 4 | dummy3 Fvb1-1 gene 8487.0 8797.0 - 311.0 5 | dummy4 Fvb1-1 gene 9361.0 9658.0 + 298.0 6 | dummy5 Fvb1-1 gene 10000.0 12000.0 + 2001.0 7 | dummy6 Fvb1-1 gene 11000.0 12000.0 + 1001.0 8 | dummy7 Fvb1-1 gene 12000.0 13000.0 + 1001.0 9 | dummy8 Fvb1-1 gene 13000.0 14000.0 + 1001.0 10 | -------------------------------------------------------------------------------- /examples/Human/README.md: -------------------------------------------------------------------------------- 1 | # Genome: 2 | 3 | I acquired the human genome gene annotation through [CoGe](https://genomevolution.org/coge/GenomeInfo.pl?gid=25747). And I acquired a TE annotation through the [UCSC Genome Browser](https://genome-euro.ucsc.edu/cgi-bin/hgTables) using the options groups: repeats, track: RepeatMasker, and output format: all fields. I had to do some reformatting for this TE annotation to get it to be compliant with TE Density and those scripts are present in: xyz (**CITE**)) 4 | 5 | Reformatting of human gene annotation file, *gencode.v37.annotation.gff3*, use commands *python examples/Human/import_scripts/import_human_gene_anno.py* on the gene file. 
6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name="te-density", 6 | version="2.1.1", 7 | description="Calculates Transposable Element density", 8 | url="https://github.com/sjteresi/TE_Density", 9 | packages=find_packages(), 10 | author="Scott Teresi, Michael Teresi", 11 | license = "GPL-3.0", 12 | install_requires=[ 13 | 'coloredlogs>=15.0', 14 | 'h5py>=3.7', 15 | 'matplotlib>=3.6', 16 | 'numpy>=1.23', 17 | 'numexpr>=2.8.3', 18 | 'pandas>=1.5', 19 | 'scipy>=1.9', 20 | 'tqdm>=4.64', 21 | ], 22 | scripts=[ 23 | './process_genome.py' 24 | ], 25 | ) 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==22.1.0 2 | black==23.3.0 3 | click==8.1.3 4 | coloredlogs==15.0.1 5 | contourpy==1.0.6 6 | cycler==0.11.0 7 | exceptiongroup==1.0.4 8 | fonttools==4.38.0 9 | h5py==3.8.0 10 | humanfriendly==10.0 11 | iniconfig==1.1.1 12 | kiwisolver==1.4.4 13 | matplotlib==3.6.2 14 | mypy-extensions==0.4.3 15 | numexpr==2.8.4 16 | numpy==1.24.2 17 | packaging==23.1 18 | pandas==1.5.2 19 | pathspec==0.10.2 20 | Pillow==9.3.0 21 | platformdirs==2.5.4 22 | pluggy==1.0.0 23 | pyparsing==3.0.9 24 | pytest==7.3.1 25 | python-dateutil==2.8.2 26 | pytz==2022.6 27 | scipy==1.9.3 28 | six==1.16.0 29 | tables==3.7.0 30 | tomli==2.0.1 31 | tqdm==4.65.0 32 | typing-extensions==3.7.4.3 33 | wcwidth==0.1.7 34 | wrapt==1.11.2 35 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 2.1.3 2 | - fleshout import_filtere_genes exception message 3 | 4 | 2.1.2 5 | - refactoring, add _DensitySubset 6 | 7 | 2.1.1 8 | - add initial setup.py 9 | - 
remove tables (pytables) 10 | - upgrade numpy 1.23.5 --> 1.24.2 11 | - upgrade h5py 3.7.0 --> 3.8.0 12 | - test on cpython 3.11.3+ 13 | 14 | 2.1.0 15 | - upgrade py3.8 --> py3.10 16 | - upgrade dependencies 17 | 18 | 2.0.0 19 | - upgrade h5py 2.10 --> 3.7 20 | 21 | 1.0.1 22 | - use default h5py.File raw data chunk cache size (rdcc_nbytes), 23 | which may reduce ram usage in some cases 24 | 25 | 1.0.0 26 | - upgrade numpy 1.20.2 --> 1.23.3 27 | - upgrade pandas 1.0.5 --> 1.4.4 28 | - upgrade scipy 1.6.3 --> 1.9.1 29 | - add note on python3.8 / dependencies 30 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_SingleElongate_Nameless_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4133.0 - DNA Hat 339.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 560.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9556.0 9677.0 + Completely_Unknown Completely_Unknown 122.0 9 | Fvb1-1 9675.0 11249.0 - Completely_Unknown Completely_Unknown 1575.0 10 | Fvb1-1 23456.0 24419.0 - TIR CACTA 963.0 11 | Fvb1-1 24415.0 24625.0 - LTR Copia 211.0 12 | Fvb1-1 27000.0 27500.0 - DNA MULE 501.0 13 | Fvb1-1 27200.0 27550.0 - LTR Copia 351.0 14 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_SingleElongate_Order_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4133.0 - LTR Gypsy 339.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 560.0 6 | Fvb1-1 8459.0 9266.0 - 
Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9556.0 9677.0 + Completely_Unknown Completely_Unknown 122.0 9 | Fvb1-1 9675.0 11249.0 - Completely_Unknown Completely_Unknown 1575.0 10 | Fvb1-1 23456.0 24419.0 - TIR CACTA 963.0 11 | Fvb1-1 24415.0 24625.0 - TIR Test 211.0 12 | Fvb1-1 24000.0 24500.0 - DNA MULE 501.0 13 | Fvb1-1 24200.0 24550.0 - LTR Copia 351.0 14 | -------------------------------------------------------------------------------- /tests/input_data/Test_Preprocess_Cleaned_TEs_Unequal_Number_Chrom.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | VaccDscaff1 280.0 369.0 + LTR Unknown_LTR_Superfam 90.0 3 | VaccDscaff1 396.0 494.0 - TIR Tc1-Mariner 99.0 4 | VaccDscaff1 495.0 783.0 + TIR Tc1-Mariner 289.0 5 | VaccDscaff1 3197.0 3361.0 + TIR hAT 165.0 6 | VaccDscaff1 3722.0 3847.0 + LTR Copia 126.0 7 | VaccDscaff1 3749.0 3981.0 - TIR Tc1-Mariner 233.0 8 | VaccDscaff1 4349.0 4509.0 - Helitron Helitron 161.0 9 | VaccDscaff1 4451.0 4663.0 + Helitron Helitron 213.0 10 | VaccDscaff1 4591.0 4681.0 + TIR Mutator 91.0 11 | VaccDscaff2 4884.0 5172.0 - LTR Unknown_LTR_Superfam 289.0 12 | VaccDscaff2 7111.0 7267.0 - Helitron Helitron 157.0 13 | VaccDscaff2 7174.0 7288.0 + TIR PIF-Harbinger 115.0 14 | VaccDscaff2 7209.0 7400.0 + Helitron Helitron 192.0 15 | VaccDscaff2 10667.0 10833.0 + LTR Copia 167.0 16 | -------------------------------------------------------------------------------- /tests/input_data/Test_Preprocess_Cleaned_TEs_Unequal_Chrom_IDs.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | VaccDscaff24 280.0 369.0 + LTR Unknown_LTR_Superfam 90.0 3 | VaccDscaff24 396.0 494.0 - TIR Tc1-Mariner 99.0 4 | VaccDscaff24 495.0 783.0 + TIR Tc1-Mariner 289.0 5 | VaccDscaff24 3197.0 3361.0 + TIR hAT 
165.0 6 | VaccDscaff24 3722.0 3847.0 + LTR Copia 126.0 7 | VaccDscaff24 3749.0 3981.0 - TIR Tc1-Mariner 233.0 8 | VaccDscaff24 4349.0 4509.0 - Helitron Helitron 161.0 9 | VaccDscaff24 4451.0 4663.0 + Helitron Helitron 213.0 10 | VaccDscaff24 4591.0 4681.0 + TIR Mutator 91.0 11 | VaccDscaff24 4884.0 5172.0 - LTR Unknown_LTR_Superfam 289.0 12 | VaccDscaff24 7111.0 7267.0 - Helitron Helitron 157.0 13 | VaccDscaff24 7174.0 7288.0 + TIR PIF-Harbinger 115.0 14 | VaccDscaff24 7209.0 7400.0 + Helitron Helitron 192.0 15 | VaccDscaff24 10667.0 10833.0 + LTR Copia 167.0 16 | -------------------------------------------------------------------------------- /DESIGN: -------------------------------------------------------------------------------- 1 | # Densities 2 | 3 | 4 | # Pipeline 5 | Design for processing the gene / transposon files. 6 | 7 | Input gene / transposon files, output density files. 8 | Calculate the multiple TE densities for each gene, with a sweep of window values. 9 | 10 | ## Steps 11 | The pipeline is a generic split / apply / combine. 12 | Each density is independent, however, the densities are accumulated (summed) at the end. 13 | 14 | 15 | 0. Input gene / TE files; input window length 16 | 1. Preprocess 17 | - chunkify gene / TE based on chromosome 18 | - list sub-gene / sub-TE pairs 19 | 2. Split wrt gene names / Merge wrt TE overlap 20 | - for each sub-gene / sub-TE pair 21 | - for each window 22 | - start workers 23 | - request overlap for each gene name (handling failed requests!) 24 | - merge worker results (sums of overlaps ) 25 | - calculate densities, write to file 26 | -------------------------------------------------------------------------------- /tests/unit/test_MergeWorker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test MergeWorker. 
5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import logging 10 | import os 11 | import pytest 12 | import tempfile 13 | 14 | import coloredlogs 15 | import numpy as np 16 | import pandas as pd 17 | 18 | from transposon.merge_data import MergeData 19 | from transposon.merge_data import _MergeConfigSink, _MergeConfigSource, _SummationArgs 20 | from transposon.transposon_data import TransposonData 21 | from transposon.gene_data import GeneData 22 | from transposon.overlap import OverlapData, OverlapWorker 23 | 24 | @pytest.fixture() 25 | def temp_dir(): 26 | """Temporary directory.""" 27 | 28 | with tempfile.TemporaryDirectory() as dir: 29 | yield dir 30 | 31 | @pytest.fixture(scope="module") 32 | def overlap_data(): 33 | pass 34 | # scope=module b/c we can reuse the files b/c they are read only 35 | -------------------------------------------------------------------------------- /examples/Arabidopsis/src/Annotate_EDTA_Arabidopsis_thaliana.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J EDTA_Arabidopsis 4 | #SBATCH --time=100:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=25 7 | #SBATCH --mem-per-cpu=30G 8 | #SBATCH -o EDTA_Arabidopsis.out 9 | #-------------------------------------------------------- 10 | # NOTE the user should change these paths to match their machine 11 | EDTA_DIR=/mnt/research/edgerpat_lab/EDTA 12 | GENOME_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Arabidopsis/Sequences 13 | OUT_DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Arabidopsis/TEs 14 | 15 | 16 | # NOTE 17 | # Do these commands ahead of trying to run EDTA to properly install 18 | # git clone https://github.com/oushujun/EDTA.git 19 | # cd EDTA 20 | # conda env create -f EDTA.yml 21 | 22 | module purge 23 | module load Conda/3 24 | conda activate EDTA # activate the conda environment of packages 25 | 26 | cd $OUT_DATA_DIR # cd to output data dir for any 
extraneous files that get outputted 27 | 28 | # Run EDTA 29 | perl $EDTA_DIR/EDTA.pl --genome $GENOME_DIR/TAIR10_chr_main_chromosomes.fas --cds $GENOME_DIR/Arabidopsis_CDS.fasta --sensitive 1 --anno 1 --threads 25 30 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/Annotate_EDTA_Rice_Sativa.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J EDTA_Sativa 4 | #SBATCH --time=100:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=25 7 | #SBATCH --mem-per-cpu=30G 8 | #SBATCH -o EDTA_Sativa.out 9 | #-------------------------------------------------------- 10 | # NOTE the user should change these paths to match their machine 11 | EDTA_DIR=/mnt/research/edgerpat_lab/EDTA 12 | GENOME_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/Oryza_Sativa/Sequences 13 | OUT_DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/Oryza_Sativa/TEs 14 | 15 | 16 | # NOTE 17 | # Do these commands ahead of trying to run EDTA to properly install 18 | # git clone https://github.com/oushujun/EDTA.git 19 | # cd EDTA 20 | # conda env create -f EDTA.yml 21 | 22 | module purge 23 | module load Conda/3 24 | conda activate EDTA # activate the conda environment of packages 25 | 26 | cd $OUT_DATA_DIR # cd to output data dir for any extraneous files that get outputted 27 | 28 | # Run EDTA 29 | perl $EDTA_DIR/EDTA.pl --genome $GENOME_DIR/Oryza_Sativa_NewNames.fasta --cds $GENOME_DIR/Oryza_Sativa_CDS_NewNames.fasta --sensitive 1 --anno 1 --threads 25 30 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/Annotate_TE_Blueberry_EDTA.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J EDTA_Blueberry 4 | #SBATCH --time=167:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=25 7 | #SBATCH 
--mem-per-cpu=30G 8 | #SBATCH -o EDTA_Blueberry.out 9 | #SBATCH -e EDTA_Blueberry.err 10 | #-------------------------------------------------------- 11 | # NOTE the user should change these paths to match their machine 12 | EDTA_DIR=/mnt/research/edgerpat_lab/EDTA 13 | GENOME_DIR=/mnt/research/edgerpat_lab/Scotty/Blueberry_Data/Genome 14 | OUT_DATA_DIR=/mnt/research/edgerpat_lab/Scotty/Blueberry_Data/Blueberry_TE_Density/Annotation 15 | 16 | 17 | # NOTE 18 | # Do these commands ahead of trying to run EDTA to properly install 19 | # git clone https://github.com/oushujun/EDTA.git 20 | # cd EDTA 21 | # conda env create -f EDTA.yml 22 | 23 | module purge 24 | module load Conda/3 25 | conda activate EDTA # activate the conda environment of packages 26 | 27 | cd $OUT_DATA_DIR # cd to output data dir for any extraneous files that get outputted 28 | 29 | # Run EDTA 30 | perl $EDTA_DIR/EDTA.pl --genome $GENOME_DIR/Vaccinium_corymbosum.faa --cds $GENOME_DIR/Vacc_c_CoGe_CDS.fasta --sensitive 1 --anno 1 --threads 25 31 | -------------------------------------------------------------------------------- /tests/input_data/Test_Gene_Anno_Float_Conversion.tsv: -------------------------------------------------------------------------------- 1 | Fvb1-1 maker gene 41 2396 . + . ID=maker-Fvb1-1-snap-gene-0.15;Name=maker-Fvb1-1-snap-gene-0.15 2 | Fvb1-1 maker gene 5556 7978 . - . ID=maker-Fvb1-1-augustus-gene-0.13;Name=maker-Fvb1-1-augustus-gene-0.13 3 | Fvb1-1 maker gene 8487 8797 . - . ID=maker-Fvb1-1-snap-gene-0.18;Name=maker-Fvb1-1-snap-gene-0.18 4 | Fvb1-1 maker gene 9361 9658 . + . ID=snap_masked-Fvb1-1-processed-gene-0.6;Name=snap_masked-Fvb1-1-processed-gene-0.6 5 | Fvb1-1 maker gene 11127 11411 . - . ID=augustus_masked-Fvb1-1-processed-gene-0.4;Name=augustus_masked-Fvb1-1-processed-gene-0.4 6 | Fvb1-1 maker gene 84598 86703 . + . ID=maker-Fvb1-1-snap-gene-0.16;Name=maker-Fvb1-1-snap-gene-0.16 7 | Fvb1-1 maker gene 117287120 117715971 . + . 
ID=maker-Fvb1-1-augustus-gene-3.17;Name=maker-Fvb1-1-augustus-gene-3.17 8 | Fvb1-1 maker gene 118974314397 118974317655 . - . ID=maker-Fvb1-1-augustus-gene-3.19;Name=maker-Fvb1-1-augustus-gene-3.19 9 | Fvb1-1 maker gene 22456307315831 22456307317608 . + . ID=maker-Fvb1-1-snap-gene-3.20;Name=maker-Fvb1-1-snap-gene-3.20 10 | Fvb1-1 maker gene 88877765432319026 88877765432320584 . + . ID=augustus_masked-Fvb1-1-processed-gene-3.0;Name=augustus_masked-Fvb1-1-processed-gene-3.0 11 | -------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_MultiElongate_Order_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4215.0 - LTR Copia 420.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 360.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9256.0 9677.0 + Completely_Unknown Completely_Unknown 421.0 9 | Fvb1-1 9700.0 10000.0 - TIR hAT 301.0 10 | Fvb1-1 9678.0 11249.0 - Completely_Unknown Completely_Unknown 1572.0 11 | Fvb1-1 11248.0 11469.0 + Completely_Unknown Completely_Unknown 220.0 12 | Fvb1-1 11450.0 11886.0 + Completely_Unknown Completely_Unknown 298.0 13 | Fvb1-1 12193.0 12404.0 - TIR hAT 212.0 14 | Fvb1-1 13625.0 13774.0 + Completely_Unknown Completely_Unknown 150.0 15 | Fvb1-1 13799.0 13892.0 - TIR Mutator 94.0 16 | Fvb1-1 13892.0 14156.0 + Helitron Helitron 265.0 17 | Fvb1-1 14154.0 15323.0 + Helitron Helitron 1170.0 18 | Fvb1-1 15320.0 16343.0 - Helitron Helitron 1114.0 19 | Fvb1-1 20236.0 20466.0 - LTR Copia 231.0 20 | Fvb1-1 20465.0 20899.0 + LTR Unknown_LTR_Superfam 435.0 21 | Fvb1-1 20880.0 22846.0 - LTR Gypsy 1966.0 22 | -------------------------------------------------------------------------------- 
/tests/input_data/Test_SingleC_MultiElongate_Nameless_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4215.0 - LTR Copia 420.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 360.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9256.0 9677.0 + Completely_Unknown Completely_Unknown 421.0 9 | Fvb1-1 9678.0 11249.0 - Completely_Unknown Completely_Unknown 1572.0 10 | Fvb1-1 9700.0 10000.0 - TIR hAT 301.0 11 | Fvb1-1 11248.0 11469.0 + Completely_Unknown Completely_Unknown 220.0 12 | Fvb1-1 11450.0 11886.0 + Completely_Unknown Completely_Unknown 298.0 13 | Fvb1-1 12193.0 12404.0 - TIR hAT 212.0 14 | Fvb1-1 13625.0 13774.0 + Completely_Unknown Completely_Unknown 150.0 15 | Fvb1-1 13799.0 13892.0 - TIR Mutator 94.0 16 | Fvb1-1 13890.0 14156.0 + Helitron Helitron 265.0 17 | Fvb1-1 14154.0 15323.0 + Helitron Helitron 1170.0 18 | Fvb1-1 15320.0 16343.0 - Helitron Helitron 1114.0 19 | Fvb1-1 20236.0 20466.0 - LTR Copia 231.0 20 | Fvb1-1 20465.0 20899.0 + LTR Unknown_LTR_Superfam 435.0 21 | Fvb1-1 20880.0 22846.0 - LTR Gypsy 1966.0 22 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/Annotate_EDTA_Rice_Glaberrima.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J EDTA_Glaberrima 4 | #SBATCH --time=167:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=25 7 | #SBATCH --mem-per-cpu=30G 8 | #SBATCH -o EDTA_Glaberrima.out 9 | #-------------------------------------------------------- 10 | # NOTE the user should change these paths to match their machine 11 | EDTA_DIR=/mnt/research/edgerpat_lab/EDTA 12 | 
GENOME_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/Oryza_Glaberrima/Sequences 13 | OUT_DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density_Example_Data/Rice/Oryza_Glaberrima/TEs 14 | 15 | 16 | # NOTE 17 | # Do these commands ahead of trying to run EDTA to properly install 18 | # git clone https://github.com/oushujun/EDTA.git 19 | # cd EDTA 20 | # conda env create -f EDTA.yml 21 | 22 | module purge 23 | module load Conda/3 24 | conda activate EDTA # activate the conda environment of packages 25 | 26 | cd $OUT_DATA_DIR # cd to output data dir for any extraneous files that get outputted 27 | 28 | # Run EDTA 29 | perl $EDTA_DIR/EDTA.pl --genome $GENOME_DIR/Oryza_Glaberrima_NewNames.fasta --cds $GENOME_DIR/Oryza_Glaberrima_CDS_NewNames.fasta --sensitive 1 --anno 1 --threads 25 30 | -------------------------------------------------------------------------------- /tests/input_data/Test_Preprocess_Cleaned_Genes.tsv: -------------------------------------------------------------------------------- 1 | Gene_Name Chromosome Feature Start Stop Strand Length 2 | maker-VaccDscaff1-snap-gene-0.31 VaccDscaff1 gene 893.0 24185.0 + 23293.0 3 | maker-VaccDscaff1-snap-gene-0.32 VaccDscaff1 gene 24880.0 31033.0 + 6154.0 4 | maker-VaccDscaff1-snap-gene-0.36 VaccDscaff1 gene 31517.0 32457.0 - 941.0 5 | maker-VaccDscaff1-snap-gene-0.37 VaccDscaff1 gene 32462.0 39522.0 - 7061.0 6 | snap_masked-VaccDscaff1-processed-gene-0.15 VaccDscaff1 gene 49288.0 54958.0 + 5671.0 7 | maker-VaccDscaff1-augustus-gene-0.27 VaccDscaff1 gene 62151.0 89671.0 + 27521.0 8 | maker-VaccDscaff1-augustus-gene-0.30 VaccDscaff1 gene 86018.0 89723.0 - 3706.0 9 | maker-VaccDscaff1-augustus-gene-1.27 VaccDscaff1 gene 97467.0 99179.0 - 1713.0 10 | augustus_masked-VaccDscaff1-processed-gene-1.5 VaccDscaff1 gene 107244.0 110895.0 - 3652.0 11 | maker-VaccDscaff1-snap-gene-1.29 VaccDscaff1 gene 110385.0 118861.0 + 8477.0 12 | maker-VaccDscaff1-augustus-gene-1.24 VaccDscaff1 gene 128901.0 132573.0 + 
3673.0 13 | maker-VaccDscaff1-augustus-gene-1.25 VaccDscaff1 gene 147894.0 149490.0 + 1597.0 14 | maker-VaccDscaff1-snap-gene-1.32 VaccDscaff1 gene 179007.0 185476.0 + 6470.0 15 | maker-VaccDscaff1-augustus-gene-1.28 VaccDscaff1 gene 186127.0 189575.0 - 3449.0 16 | -------------------------------------------------------------------------------- /examples/Human/src/TE_Density_Human.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Human_Density 4 | #SBATCH --time=165:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=2 7 | #SBATCH --mem-per-cpu=90G 8 | #SBATCH -o Human_Density.out 9 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Human/Human_Density.out 10 | #-------------------------------------------------------- 11 | echo "" 12 | echo "Job Information" 13 | echo "Job ID:" $SLURM_JOB_ID 14 | echo "" 15 | 16 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 17 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/filtered_input_data 18 | OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Human/ 19 | GENOME="Human_Chrom_Subset" 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python process_genome.py $DATA_DIR/Cleaned_Chr7_13_Human_Genes.tsv $DATA_DIR/Cleaned_Chr7_13_Human_TEs.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 2 --reset_h5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/TE_Density_Sativa.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Sativa_Density 4 | 
#SBATCH --time=24:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --mem-per-cpu=25G 8 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Rice/Sativa_Density.out 9 | #-------------------------------------------------------- 10 | echo "" 11 | echo "Job Information" 12 | echo "Job ID:" $SLURM_JOB_ID 13 | echo "" 14 | 15 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 16 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/filtered_input_data/ 17 | OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Rice/ 18 | GENOME="Sativa" 19 | 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python process_genome.py $DATA_DIR/Cleaned_Oryza_sativa.IRGSP-1.0.50.tsv $DATA_DIR/Cleaned_Oryza_Sativa_NewNames.fasta.mod.EDTA.TEanno.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 12 --reset_h5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/TE_Density_Blueberry.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Blueberry_TE_Density 4 | #SBATCH --time=36:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=20 7 | #SBATCH --mem-per-cpu=32G 8 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Blueberry/Blueberry_Density.out 9 | #-------------------------------------------------------- 10 | echo "" 11 | echo "Job Information" 12 | echo "Job ID:" $SLURM_JOB_ID 13 | echo "" 14 | 15 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 16 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Blueberry/filtered_input_data 17 | 
OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Blueberry/ 18 | GENOME="Vacc_Cory" 19 | 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python $ROOT_DIR/process_genome.py $DATA_DIR/Cleaned_Blueberry_Genes.tsv $DATA_DIR/Cleaned_Blueberry_EDTA_TEs.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 20 --reset_h5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /examples/Arabidopsis/src/TE_Density_Arabidopsis.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Arabidopsis_Density 4 | #SBATCH --time=12:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=5 7 | #SBATCH --mem-per-cpu=8G 8 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Arabidopsis/Arabidopsis_Density.out 9 | #-------------------------------------------------------- 10 | echo "" 11 | echo "Job Information" 12 | echo "Job ID:" $SLURM_JOB_ID 13 | echo "" 14 | 15 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 16 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/filtered_input_data/ 17 | OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Arabidopsis/ 18 | GENOME="Arabidopsis" 19 | 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python process_genome.py 
$DATA_DIR/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv $DATA_DIR/Cleaned_TAIR10_chr_main_chromosomes.fas.mod.EDTA.TEanno.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/TE_Density_Glaberrima.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash -login 2 | 3 | #SBATCH -J Glaberrima_Density 4 | #SBATCH --time=24:00:00 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --mem-per-cpu=20G 8 | #SBATCH -o /mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Rice/Glaberrima_Density.out 9 | #-------------------------------------------------------- 10 | echo "" 11 | echo "Job Information" 12 | echo "Job ID:" $SLURM_JOB_ID 13 | echo "" 14 | 15 | ROOT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Density 16 | DATA_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/filtered_input_data/ 17 | OUT_DIR=/mnt/research/edgerpat_lab/Scotty/TE_Data/Example_Rice/ 18 | GENOME="Glaberrima" 19 | 20 | 21 | # Load the Python on HPCC 22 | module purge 23 | module load GCC/10.2.0 Python/3.8.10 24 | 25 | # Source the Python packages that are version controlled 26 | source /mnt/research/edgerpat_lab/Scotty/venvs/TE_Density/bin/activate 27 | 28 | # Go to project directory 29 | cd $ROOT_DIR 30 | 31 | # Make output dir 32 | mkdir -p $OUT_DIR 33 | 34 | # Run the code 35 | python process_genome.py $DATA_DIR/Cleaned_Oryza_glaberrima.Oryza_glaberrima_V1.50.tsv $DATA_DIR/Cleaned_Oryza_Glaberrima_NewNames.fasta.mod.EDTA.TEanno.tsv $GENOME -c $ROOT_DIR/config/production_run_config.ini -n 12 --reset_h5 -o $OUT_DIR 36 | -------------------------------------------------------------------------------- /transposon/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Sundry functions intended for testing and development. 
5 | """ 6 | 7 | import h5py 8 | 9 | import pytest 10 | import tempfile 11 | 12 | 13 | @pytest.fixture 14 | def temp_dir(): 15 | """Yields a temporary directory.""" 16 | 17 | # NOTE using a scope of 'module' doesn't appear to work 18 | # if you import this, so default scope is used 19 | with tempfile.TemporaryDirectory() as dir: 20 | yield dir 21 | 22 | 23 | @pytest.fixture 24 | def temp_h5_file(temp_dir): # FUTURE could be a contextmanager rather than fixture 25 | """Yields a temporary HDF5 file.""" 26 | 27 | file = tempfile.NamedTemporaryFile(dir=temp_dir, suffix=".h5") 28 | with file as temp: 29 | yield temp.name 30 | 31 | 32 | @pytest.fixture 33 | def temp_h5_context(): # FUTURE could be a contextmanager rather than fixture 34 | """Yields an open HDF5 file, writes on close.""" 35 | 36 | # MAGIC 1KB as a reasonable default 37 | with tempfile.SpooledTemporaryFile(max_size=1024*1024) as temp: 38 | # MAGIC h5py convention, 'a' is append 39 | with h5py.File(temp, 'a') as file: 40 | yield file 41 | file.flush() # NOTE likely don't need this 42 | file.close() # NOTE likely don't need this 43 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/Makefile: -------------------------------------------------------------------------------- 1 | # scripts for running blueberry TE Density examples 2 | # __file__ Makefile 3 | # __author__ Scott Teresi 4 | 5 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 6 | DEV_DATA := $(realpath $(ROOT_DIR)/../../../TE_Data/Example_Blueberry) 7 | DEV_GENES := $(DEV_DATA)/Genes/Blueberry_Genes.gff 8 | DEV_TES := $(DEV_DATA)/TEs/Blueberry_EDTA_TEs.gff 9 | DEV_FILTERED := $(DEV_DATA)/filtered_input_data 10 | DEV_GENE_EXPRESSION := $(DEV_DATA)/Genes/Blueberry_TPM_All.tsv 11 | DEV_DENSITY_FILES := $(DEV_DATA)/../../TE_Data/Example_Blueberry/ 12 | 13 | filter_genes: 14 | @echo Filtering blueberry genes into appropriate format for TE Density 15 | @echo 16 | python 
$(ROOT_DIR)/src/import_blueberry_gene_anno.py $(DEV_GENES) $(DEV_FILTERED) 17 | 18 | filter_TEs: 19 | @echo Filtering blueberry TEs into appropriate format for TE Density 20 | @echo 21 | python $(ROOT_DIR)/src/import_blueberry_EDTA.py $(DEV_TES) $(DEV_FILTERED) 22 | 23 | calculate_TE_Density: 24 | @echo Running TE Density for blueberry 25 | sbatch $(ROOT_DIR)/src/TE_Density_Blueberry.sb 26 | 27 | generate_expression_graphs: 28 | @echo Generating TE density vs. gene expression graphs 29 | @echo 30 | mkdir -p $(ROOT_DIR)/results/graphs 31 | python $(ROOT_DIR)/src/compare_expression.py $(DEV_GENE_EXPRESSION) $(DEV_GENES) $(DEV_DENSITY_FILES) -o $(ROOT_DIR)/results/graphs 32 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # scripts for development 2 | # __file__ Makefile 3 | # __author__ Michael Teresi 4 | 5 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 6 | 7 | SYS_TEST_DIR := tests/system_test_input_data 8 | SYS_TEST_GENES := $(ROOT_DIR)/$(SYS_TEST_DIR)/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv 9 | SYS_TEST_TES := $(ROOT_DIR)/$(SYS_TEST_DIR)/Cleaned_TAIR10_chr_main_chromosomes.fas.mod.EDTA.TEanno.tsv 10 | 11 | 12 | .PHONY: help 13 | help: ## Show this help 14 | @grep -E '^[a-z_A-Z0-9^.(]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 15 | 16 | 17 | .PHONY: system_test 18 | system_test: ## run system test on sample data 19 | mkdir -p ./tmp 20 | python $(ROOT_DIR)/process_genome.py $(SYS_TEST_GENES) $(SYS_TEST_TES) Test -o ./tmp 21 | 22 | 23 | .PHONY: system_clean 24 | system_clean: ## clean the system test 25 | rm -rf ./tmp 26 | 27 | 28 | .PHONY: test 29 | test: ## run the tests 30 | mkdir -p $(ROOT_DIR)/tests/test_h5_cache_loc 31 | mkdir -p $(ROOT_DIR)/tests/output_data 32 | pytest $(ROOT_DIR) 33 | 34 | 35 | .PHONY: flake8 36 | flake8: ## run style 
guide 37 | flake8 $(ROOT_DIR) 38 | 39 | 40 | .PHONY: lint 41 | lint: ## run linter 42 | pylint $(ROOT_DIR)/transposon 43 | 44 | 45 | .PHONY: tags 46 | tags: ## run ctags 47 | ctags \ 48 | $(ROOT_DIR)/*.py \ 49 | $(ROOT_DIR)/transposon/*.py 50 | -------------------------------------------------------------------------------- /tests/unit/test_OverlapManager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test OverlapManager. 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import pytest 10 | import os 11 | import tempfile 12 | 13 | from transposon.overlap_manager import OverlapManager 14 | from transposon.transposon_data import TransposonData 15 | from transposon.gene_data import GeneData 16 | 17 | N_SUBGENES = 4 # MAGIC arbitrary, enough for testing 18 | GENOME_ID = "FAKE_GENOME_ID" 19 | 20 | 21 | @pytest.fixture(scope="session") 22 | def temp_dir(): 23 | """Temporary directory.""" 24 | 25 | with tempfile.TemporaryDirectory() as dir: 26 | yield dir 27 | 28 | 29 | @pytest.fixture(scope="session") 30 | def temp_filenames(temp_dir): 31 | names = [next(tempfile._get_candidate_names()) for _ in range(N_SUBGENES)] 32 | paths = [os.path.join(temp_dir, n) for n in names] 33 | return paths 34 | 35 | 36 | @pytest.fixture(scope="session") 37 | def sub_genes(temp_filenames): 38 | 39 | genes = [GeneData.mock(genome_id=GENOME_ID) for _ in range(len(temp_filenames))] 40 | return [gene.write(path) for gene, path in zip(genes, temp_filenames)] 41 | 42 | 43 | @pytest.fixture(scope="session") 44 | def sub_transposons(temp_filenames): 45 | 46 | genes = [ 47 | TransposonData.mock(genome_id=GENOME_ID) for _ in range(len(temp_filenames)) 48 | ] 49 | return [gene.write(path) for gene, path in zip(genes, temp_filenames)] 50 | 51 | 52 | # def test_init_nothrow(sub_genes, sub_transposons): 53 | # pass 54 | 55 | 56 | if __name__ == "__main__": 57 | pytest.main(["-s", __file__]) # for convenience 58 | 
-------------------------------------------------------------------------------- /tests/input_data/Test_SingleC_MultiElongate_Superfam_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4215.0 - LTR Copia 420.0 4 | Fvb1-1 4209.0 4761.0 - LTR Copia 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 360.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9256.0 9677.0 + Completely_Unknown Completely_Unknown 421.0 9 | Fvb1-1 9678.0 11249.0 - Completely_Unknown Completely_Unknown 1572.0 10 | Fvb1-1 11248.0 11469.0 + Completely_Unknown Completely_Unknown 220.0 11 | Fvb1-1 11450.0 11886.0 + Completely_Unknown Completely_Unknown 298.0 12 | Fvb1-1 12193.0 12404.0 - TIR hAT 212.0 13 | Fvb1-1 13625.0 13774.0 + Completely_Unknown Completely_Unknown 150.0 14 | Fvb1-1 13799.0 13892.0 - TIR Mutator 94.0 15 | Fvb1-1 13892.0 14156.0 + Helitron Helitron 265.0 16 | Fvb1-1 14154.0 15323.0 + Helitron Helitron 1170.0 17 | Fvb1-1 15320.0 16343.0 - Helitron Helitron 1114.0 18 | Fvb1-1 16344.0 16966.0 - LTR Gypsy 623.0 19 | Fvb1-1 17033.0 17209.0 - Completely_Unknown Completely_Unknown 177.0 20 | Fvb1-1 20215.0 20599.0 + TIR hAT 385.0 21 | Fvb1-1 20236.0 20466.0 - LTR Unknown_LTR_Superfam 231.0 22 | Fvb1-1 20465.0 20899.0 + LTR Unknown_LTR_Superfam 435.0 23 | Fvb1-1 20880.0 22846.0 - LTR Unknown_LTR_Superfam 1966.0 24 | Fvb1-1 23216.0 23346.0 + Completely_Unknown Completely_Unknown 131.0 25 | Fvb1-1 23271.0 23459.0 - TIR Mutator 189.0 26 | Fvb1-1 23456.0 24419.0 - TIR CACTA 963.0 27 | Fvb1-1 24415.0 24625.0 - TIR CACTA 211.0 28 | Fvb1-1 24000.0 24500.0 - DNA MULE 501.0 29 | Fvb1-1 24200.0 24550.0 - LTR Copia 351.0 30 | -------------------------------------------------------------------------------- 
/tests/input_data/Test_SingleC_SingleElongate_Superfam_Revision.tsv: -------------------------------------------------------------------------------- 1 | Chromosome Start Stop Strand Order SuperFamily Length 2 | Fvb1-1 3350.0 3957.0 - LTR Copia 608.0 3 | Fvb1-1 3795.0 4133.0 - LTR Copia 339.0 4 | Fvb1-1 4209.0 4761.0 - LTR Gypsy 553.0 5 | Fvb1-1 4670.0 5229.0 - LTR Gypsy 560.0 6 | Fvb1-1 8459.0 9266.0 - Completely_Unknown Completely_Unknown 808.0 7 | Fvb1-1 8794.0 9370.0 + Completely_Unknown Completely_Unknown 577.0 8 | Fvb1-1 9556.0 9677.0 + Completely_Unknown Completely_Unknown 122.0 9 | Fvb1-1 9675.0 11249.0 - Completely_Unknown Completely_Unknown 1575.0 10 | Fvb1-1 11253.0 11469.0 + Completely_Unknown Completely_Unknown 217.0 11 | Fvb1-1 11450.0 11886.0 + Completely_Unknown Completely_Unknown 437.0 12 | Fvb1-1 12193.0 12404.0 - TIR hAT 212.0 13 | Fvb1-1 13625.0 13774.0 + Completely_Unknown Completely_Unknown 150.0 14 | Fvb1-1 13799.0 13892.0 - TIR Mutator 94.0 15 | Fvb1-1 13892.0 14156.0 + Helitron Helitron 265.0 16 | Fvb1-1 14154.0 15323.0 + Helitron Helitron 1170.0 17 | Fvb1-1 15397.0 16343.0 - LTR Unknown_LTR_Superfam 947.0 18 | Fvb1-1 16344.0 16966.0 - LTR Gypsy 623.0 19 | Fvb1-1 17033.0 17209.0 - Completely_Unknown Completely_Unknown 177.0 20 | Fvb1-1 20215.0 20599.0 + TIR hAT 385.0 21 | Fvb1-1 20236.0 20464.0 - LTR Unknown_LTR_Superfam 229.0 22 | Fvb1-1 20465.0 20899.0 + LTR Unknown_LTR_Superfam 435.0 23 | Fvb1-1 20880.0 22846.0 - LTR Unknown_LTR_Superfam 1966.0 24 | Fvb1-1 23216.0 23346.0 + Completely_Unknown Completely_Unknown 131.0 25 | Fvb1-1 23271.0 23459.0 - TIR Mutator 189.0 26 | Fvb1-1 23456.0 24419.0 - TIR CACTA 963.0 27 | Fvb1-1 24415.0 24625.0 - TIR CACTA 211.0 28 | Fvb1-1 24000.0 24500.0 - DNA MULE 501.0 29 | Fvb1-1 24200.0 24550.0 - LTR Copia 351.0 30 | -------------------------------------------------------------------------------- /tests/unit/test_data.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3 2 | 3 | """ 4 | Unit test data.py 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import pytest 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from transposon.gene_data import GeneData 15 | from transposon.gene_datum import GeneDatum 16 | from transposon.transposon_data import TransposonData 17 | 18 | @pytest.fixture 19 | def gene_data(): 20 | """Default GeneData instance.""" 21 | 22 | return GeneData.mock() 23 | 24 | def test_init(gene_data): 25 | """Does the initializer fail?""" 26 | 27 | pass 28 | 29 | def test_subset_id_unique(): 30 | """Does the chromosome identifier work if the IDs are the same?""" 31 | 32 | genes = GeneData.mock(np.array([[0, 9], [10, 19]])) 33 | same_chromosome_name = "this_name_is_consistent" 34 | genes.chromosomes[0] = same_chromosome_name 35 | genes.chromosomes[1] = same_chromosome_name 36 | assert genes.chromosome_unique_id == same_chromosome_name 37 | 38 | def test_subset_id_missing(): 39 | """Does the chromosome identifier raise if the IDs are missing?""" 40 | 41 | genes = GeneData.mock(np.array([])) 42 | with pytest.raises(RuntimeError) as excinfo: 43 | genes.chromosome_unique_id 44 | 45 | def test_subset_id_not_unique(): 46 | """Does the property raise if the chromosome IDs aren't unique?""" 47 | 48 | genes = GeneData.mock(np.array([[0, 9], [10, 19]])) 49 | genes.chromosomes[0] = "first_not_unique_chromosome_name" 50 | genes.chromosomes[1] = "this_one_is_different" 51 | with pytest.raises(RuntimeError) as excinfo: 52 | genes.chromosome_unique_id 53 | 54 | 55 | if __name__ == "__main__": 56 | pytest.main(['-s', __file__]) # for convenience 57 | -------------------------------------------------------------------------------- /examples/Human/Makefile: -------------------------------------------------------------------------------- 1 | # scripts for running human TE Density examples 2 | # __file__ Makefile 3 | # __author__ Scott Teresi 4 | 5 | ROOT_DIR:=$(shell dirname $(realpath $(firstword 
$(MAKEFILE_LIST)))) 6 | DEV_GFF_READ_EXECUTABLE := /home/scott/Documents/Uni/Research/gffread 7 | #DEV_GFF_READ_EXECUTABLE := /mnt/research/edgerpat_lab/Scotty/gffread 8 | DEV_DATA := $(realpath $(ROOT_DIR)/../../../TE_Density_Example_Data/Human) 9 | #DEV_GENES := $(DEV_DATA)/Genes/Chr7_13_Human_Genes.tsv 10 | DEV_GENES := $(DEV_DATA)/Genes/gencode.v37.annotation.gff3 11 | #DEV_TES := $(DEV_DATA)/TEs/Chr7_13_Human_TEs.tsv 12 | DEV_TES := $(DEV_DATA)/TEs/Human_TEs.tsv 13 | DEV_PROD_CONF := $(ROOT_DIR)/../../config/production_run_config.ini 14 | 15 | DEV_FILTERED := $(realpath $(ROOT_DIR)/../../../TE_Data/filtered_input_data) 16 | DEV_HDF5 := $(realpath $(ROOT_DIR)/../../../TE_Data/finalized_data/10KB) 17 | DEV_RESULTS:= $(realpath $(ROOT_DIR)/results) 18 | 19 | filter_genes: 20 | @echo Filtering human genes into appropriate format for TE Density 21 | python $(ROOT_DIR)/src/import_human_gene_anno.py $(DEV_GENES) 22 | 23 | filter_TEs: 24 | python $(ROOT_DIR)/src/import_human_te_anno.py $(DEV_TES) 25 | 26 | examine_TE_levels: 27 | @echo 28 | python $(ROOT_DIR)/src/retrieve_info_of_genes.py $(DEV_HDF5)/Human/Human_Chrom_Subset_chr7.h5 $(DEV_FILTERED)/Cleaned_Chr7_13_Human_Genes.tsv 7 Human_Chr7 $(ROOT_DIR)/list_of_chr7_genes.txt 29 | @echo 30 | python $(ROOT_DIR)/src/retrieve_info_of_genes.py $(DEV_HDF5)/Human/Human_Chrom_Subset_chr13.h5 $(DEV_FILTERED)/Cleaned_Chr7_13_Human_Genes.tsv 13 Human_Chr13 $(ROOT_DIR)/list_of_chr13_genes.txt 31 | help: ## Show this help. 
32 | fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//' 33 | -------------------------------------------------------------------------------- /tests/unit/test_gene_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test data.py 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import pytest 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from transposon.gene_data import GeneData 15 | from transposon.gene_datum import GeneDatum 16 | from transposon.transposon_data import TransposonData 17 | 18 | @pytest.fixture 19 | def gene_data(): 20 | """Default GeneData instance.""" 21 | 22 | return GeneData.mock() 23 | 24 | def test_init(gene_data): 25 | """Does the initializer fail?""" 26 | 27 | pass 28 | 29 | def test_subset_id_unique(): 30 | """Does the chromosome identifier work if the IDs are the same?""" 31 | 32 | genes = GeneData.mock(np.array([[0, 9], [10, 19]])) 33 | same_chromosome_name = "this_name_is_consistent" 34 | genes.chromosomes[0] = same_chromosome_name 35 | genes.chromosomes[1] = same_chromosome_name 36 | assert genes.chromosome_unique_id == same_chromosome_name 37 | 38 | def test_subset_id_missing(): 39 | """Does the chromosome identifier raise if the IDs are missing?""" 40 | 41 | genes = GeneData.mock(np.array([])) 42 | with pytest.raises(RuntimeError) as excinfo: 43 | genes.chromosome_unique_id 44 | 45 | def test_subset_id_not_unique(): 46 | """Does the property raise if the chromosome IDs aren't unique?""" 47 | 48 | genes = GeneData.mock(np.array([[0, 9], [10, 19]])) 49 | genes.chromosomes[0] = "first_not_unique_chromosome_name" 50 | genes.chromosomes[1] = "this_one_is_different" 51 | with pytest.raises(RuntimeError) as excinfo: 52 | genes.chromosome_unique_id 53 | 54 | 55 | if __name__ == "__main__": 56 | pytest.main(['-s', __file__]) # for convenience 57 | 
-------------------------------------------------------------------------------- /tests/unit/test_transposon_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test transposon_data.py 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | import pytest 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from transposon.gene_data import GeneData 15 | from transposon.gene_datum import GeneDatum 16 | from transposon.transposon_data import TransposonData 17 | 18 | @pytest.fixture 19 | def te_data(): 20 | """Default TransposonData instance.""" 21 | 22 | return TransposonData.mock() 23 | 24 | def test_init(te_data): 25 | """Does the initializer fail?""" 26 | 27 | pass 28 | 29 | def test_subset_id_unique(): 30 | """Does the chromosome identifier work if the IDs are the same?""" 31 | 32 | transposons = TransposonData.mock(np.array([[0, 9], [10, 19]])) 33 | same_chromosome_name = "this_name_is_consistent" 34 | transposons.chromosomes[0] = same_chromosome_name 35 | transposons.chromosomes[1] = same_chromosome_name 36 | assert transposons.chromosome_unique_id == same_chromosome_name 37 | 38 | def test_subset_id_missing(): 39 | """Does the chromosome identifier raise if the IDs are missing?""" 40 | 41 | transposons = TransposonData.mock(np.array([])) 42 | with pytest.raises(RuntimeError) as excinfo: 43 | transposons.chromosome_unique_id 44 | 45 | def test_subset_id_not_unique(): 46 | """Does the property raise if the chromosome IDs aren't unique?""" 47 | 48 | transposons = TransposonData.mock(np.array([[3, 9], [12, 25]])) 49 | transposons.chromosomes[0] = "first_not_unique_chromosome_name" 50 | transposons.chromosomes[1] = 'Different_chromosome_name_from_default' 51 | with pytest.raises(RuntimeError) as excinfo: 52 | transposons.chromosome_unique_id 53 | 54 | 55 | 56 | if __name__ == "__main__": 57 | pytest.main(['-s', __file__]) # for convenience 58 | 
-------------------------------------------------------------------------------- /tests/unit/test_WorkerProcess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test WorkerProcess. 5 | """ 6 | 7 | __author__ = "Michael Teresi" 8 | 9 | 10 | from collections import namedtuple 11 | import coloredlogs 12 | import logging 13 | import os 14 | import pytest 15 | import multiprocessing 16 | 17 | from transposon.worker import WorkerProcess 18 | 19 | 20 | class FakeWorker(WorkerProcess): 21 | """Implementation of WorkerProcess for testing.""" 22 | 23 | def execute_job(self, job): 24 | """Arbitrary job to square an input.""" 25 | 26 | logging.debug("got {}".format(job)) 27 | return (job * job, os.getpid()) 28 | 29 | 30 | @pytest.fixture() 31 | def worker(): 32 | mgr = multiprocessing.Manager 33 | input = multiprocessing.Queue() 34 | output = multiprocessing.Queue() 35 | stop = multiprocessing.Event() 36 | yield FakeWorker(input, output, stop) 37 | 38 | @pytest.fixture() 39 | def worker_running(worker): 40 | """Yield a running process.""" 41 | worker.start() 42 | yield worker 43 | worker.stop_event.set() 44 | worker.join() 45 | 46 | def test_start_stop(worker): 47 | """Can we start / stop the worker?""" 48 | 49 | worker.start() 50 | worker.stop_event.set() 51 | worker.join() 52 | 53 | def test_answer(worker_running): 54 | """Do we get feedback?""" 55 | number = 4 56 | worker_running.input.put(4) 57 | answer, worker_pid = worker_running.output.get(timeout=1) 58 | assert answer == number**2 59 | 60 | def test_newprocess(worker_running): 61 | """Is the worker in a different process?""" 62 | 63 | worker_running.input.put(4) 64 | answer, worker_pid = worker_running.output.get(timeout=1) 65 | assert os.getpid() != worker_pid 66 | 67 | 68 | if __name__ == "__main__": 69 | logger = logging.getLogger(__name__) 70 | coloredlogs.install(level=logging.INFO) 71 | pytest.main(['-s', __file__]) # for 
convenience 72 | -------------------------------------------------------------------------------- /transposon/import_filtered_genes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from transposon import check_nulls, check_strand 3 | 4 | 5 | def import_filtered_genes(genes_input_path, logger): 6 | """ 7 | Import the preprocessed gene annotation file. Read it as pandas dataframe 8 | 9 | genes_input_path (str): Path to cleaned input annotation file of genes 10 | 11 | logger (logging.Logger): Logging object 12 | 13 | Returns: 14 | gene_data (pandas.core.frame.DataFrame): A pandas dataframe 15 | representing the preprocessed gene annotation file 16 | """ 17 | try: 18 | gene_data = pd.read_csv( 19 | genes_input_path, 20 | header="infer", 21 | sep="\t", 22 | dtype={ 23 | "Start": "float64", 24 | "Stop": "float64", 25 | "Length": "float64", 26 | "Chromosome": str, 27 | "Strand": str, 28 | "Feature": str, 29 | "Gene_Name": str, 30 | }, 31 | ) 32 | except Exception as err: 33 | msg = """ 34 | Error occurred while trying to read preprocessed gene 35 | annotation file into a Pandas dataframe, please refer 36 | to the README as to what information is expected 37 | input file: %s 38 | """ 39 | logger.critical(msg, genes_input_path) 40 | raise err 41 | 42 | gene_data.set_index("Gene_Name", verify_integrity=True, inplace=True) 43 | check_nulls(gene_data, logger) 44 | check_strand(gene_data, logger) 45 | 46 | # NOTE edit Strand '.' values to be sense orientation as 47 | # described in check_strand() 48 | gene_data["Strand"].replace(to_replace={".": "+"}, inplace=True) 49 | 50 | # Sort for legibility 51 | gene_data.sort_values(by=["Chromosome", "Start"], inplace=True) 52 | 53 | logger.info("import of preprocessed gene annotation... 
success!") 54 | return gene_data 55 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/generate_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Master code file. Control filtration of syntelog data and generate summary 5 | table 6 | """ 7 | 8 | __author__ = "Scott Teresi" 9 | 10 | import argparse 11 | import os 12 | import logging 13 | import coloredlogs 14 | 15 | from import_syntelogs import import_syntelogs 16 | from transposon import check_nulls 17 | 18 | 19 | def process( 20 | syntelog_input_file, 21 | data_output_path, 22 | ): 23 | # Import the synteny data from raw file 24 | logger.info("Importing syntelogs: %s" % syntelog_input_file) 25 | syntelogs = import_syntelogs(syntelog_input_file) 26 | check_nulls(syntelogs, logger) 27 | 28 | # Wrap the data 29 | file_to_save = os.path.join(data_output_path, "set_syntelogs.tsv") 30 | logger.info("Writing syntelog data to disk: %s" % file_to_save) 31 | syntelogs.to_csv(file_to_save, sep="\t", header=True, index=False) 32 | 33 | 34 | if __name__ == "__main__": 35 | """Command line interface to link syntelogs together.""" 36 | 37 | parser = argparse.ArgumentParser(description="Filter syntelogs") 38 | path_main = os.path.abspath(__file__) 39 | parser.add_argument( 40 | "syntelog_input_file", type=str, help="parent path of syntelog file" 41 | ) 42 | parser.add_argument( 43 | "--output_directory", 44 | "-o", 45 | type=str, 46 | help="parent path of output directory", 47 | default=os.path.join(path_main, "../../../../examples/Rice_Synteny/results"), 48 | ) 49 | 50 | parser.add_argument( 51 | "-v", "--verbose", action="store_true", help="set debugging level to DEBUG" 52 | ) 53 | 54 | args = parser.parse_args() 55 | args.syntelog_input_file = os.path.abspath(args.syntelog_input_file) 56 | args.output_directory = os.path.abspath(args.output_directory) 57 | log_level = logging.DEBUG if 
args.verbose else logging.INFO 58 | logger = logging.getLogger(__name__) 59 | coloredlogs.install(level=log_level) 60 | 61 | # Process 62 | logger.info("Starting filtration...") 63 | process( 64 | args.syntelog_input_file, 65 | args.output_directory, 66 | ) 67 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples: 2 | The purpose of this repository is to demonstrate the usage of TE Density on multiple different genomes as well as show interesting application and data visualization opportunities using the TE Density's output data sets. 3 | The makefile in each genome-specific directory contains the sequence of commands we used to perform each analysis. 4 | 5 | 6 | ## Example 1 - Genome-Wide Trends of TE Presence Relative to Genes (Arabidopsis): 7 | The TE Density toolkit may be used to investigate the relationship of genes and TE presence genome-wide. 8 | Here, we create a dotplot of the average TE density values of all genes by TE type, upstream, intragenically, and downstream for chromosome 1 of the Arabidopsis genome. 9 | 10 | ### Step 1 - Create a CDS FASTA file for use in creating a TE annotation: 11 | Please reference the makefile command `create_CDS`, here we use the GFFRead tool to generate a CDS FASTA file which is useful in creating a TE annotation with EDTA. 12 | 13 | ### Step 2 - Generate a TE Annotation with EDTA: 14 | Please reference the makefile command `run_EDTA_HPCC`, here we run EDTA on MSU's high-performance computing cluster (HPCC). 15 | 16 | ### Step 3 - Preprocess each annotation file prior to running TE Density: 17 | Please reference the makefile commands `filter_genes` and `filter_TEs`, here we use the Python package Pandas to perform reformat the annotation files. This part will likely need to be custom tailored to the user's own annotation files. 
18 | 19 | ### Step 4 - Run TE Density: 20 | Please reference the makefile command `run_TE_Density_HPCC`, here we call `process_genome.py` for the Arabidopsis genome but do so in an SBATCH script because we are using the resources of the HPCC. It is better to err on the side of more RAM if the genome is taking too long to compute, it may have stalled out due to insufficient RAM. 21 | 22 | ### Step 5 - Begin analysis of TE Density data: 23 | #### Generate dotplots of average TE Density values for all genes as the window changes: 24 | Please reference the makefile command `generate_dotplots` and its python file `generate_dotplots.py`. Here we initialize the DensityData class to aid in accessing the data in the output HDF5 files. 25 | 26 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/README.md: -------------------------------------------------------------------------------- 1 | # TE Density Comparisons of Syntelogs in Rice: 2 | This README contains information pertaining to how we compared TE Density levels of syntelogs using two closely related rice genomes. Please refer to the `Makefile` for our explicit commands we used from start to finish in creating and using the data relevant to this example. 3 | 4 | # Downloading of Rice FASTA and Gene Annotation Files: 5 | 6 | # Generating a TE Annotation for Rice using EDTA: 7 | First, we need to generate a TE annotation for the two rice genomes. We do this by creating a CDS FASTA for each genome and then fixing (shortening) the names of the sequences in both the regular FASTA and CDS FASTA to run EDTA without any warnings (it does not like long sequence ID names). 8 | We use three main code blocks in the `Makefile` to get the necessary files prior to running EDTA: 9 | 1. `create_CDS` utilizes `gffread` to create a CDS FASTA file using the regular FASTA file and a gene annotation. 10 | 2. 
`fix_fasta_names` alters the sequence ID names in the FASTA file so that they aren't too long for EDTA. 11 | 3. `fix_CDS_names` alters the sequence ID names in the FASTA file so that they aren't too long for EDTA. 12 | 4. `run_EDTA_HPCC` executes the commands required to run EDTA on the HPCC (the computing cluster of MSU). Please refer to the `Annotate_EDTA_Rice_Glaberrima.sb` and `Annotate_EDTA_Rice_Sativa.sb` files to see the commands and resources we used in generating the EDTA annotations. 13 | 14 | # Perform the Pre-processing Steps of TE Density, Filter Both Annotation Files: 15 | Here please refer to `filter_genes` and `filter_TEs` to view the commands we invoked to pre-process our data. The `import_rice_gene_anno.py` and `import_rice_EDTA.py` are the primary scripts the user will need to edit for their own purposes. 16 | 1. Rice gene annotation... 17 | 18 | 19 | 20 | # Running SynMap: 21 | This section describes the methods to run [SynMap](https://genomevolution.org/CoGe/SynMap.pl) on CoGe. I ran SynMap with mostly [default options](https://genomevolution.org/wiki/index.php/SynMap), I did change one option: under *Merge Syntenic Blocks* I set it to `Quota Align Merge`. Here is the [link](https://genomevolution.org/r/1how2) for *Glaberrima vs Sativa*. 22 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/EDTA_Blueberry.out: -------------------------------------------------------------------------------- 1 | 2 | ######################################################## 3 | ##### Extensive de-novo TE Annotator (EDTA) v1.9.7 #### 4 | ##### Shujun Ou (shujun.ou.1@gmail.com) #### 5 | ######################################################## 6 | 7 | 8 | 9 | Fri Mar 19 14:21:25 EDT 2021 Dependency checking: 10 | All passed! 11 | 12 | Fri Mar 19 14:22:40 EDT 2021 The longest sequence ID in the genome contains 18 characters, which is longer than the limit (15) 13 | Trying to reformat seq IDs... 
14 | Attempt 1... 15 | Attempt 2... 16 | Fri Mar 19 14:23:25 EDT 2021 Seq ID conversion successful! 17 | 18 | A CDS file /mnt/research/edgerpat_lab/Scotty/Blueberry_Data/Genome/Vacc_c_CoGe_CDS.fasta is provided via --cds. Please make sure this is the DNA sequence of coding regions only. 19 | 20 | Fri Mar 19 14:24:05 EDT 2021 Obtain raw TE libraries using various structure-based programs: 21 | Fri Mar 19 14:24:24 EDT 2021 Obtain raw TE libraries finished. 22 | All intact TEs found by EDTA: 23 | Vaccinium_corymbosum.faa.mod.EDTA.intact.fa 24 | Vaccinium_corymbosum.faa.mod.EDTA.intact.gff3 25 | 26 | Fri Mar 19 14:24:24 EDT 2021 Perform EDTA advance filtering for raw TE candidates and generate the stage 1 library: 27 | 28 | Fri Mar 19 16:50:16 EDT 2021 EDTA advance filtering finished. 29 | 30 | Fri Mar 19 16:50:16 EDT 2021 Perform EDTA final steps to generate a non-redundant comprehensive TE library: 31 | 32 | Use RepeatModeler to identify any remaining TEs that are missed by structure-based methods. 33 | 34 | Mon Mar 22 05:38:12 EDT 2021 Clean up TE-related sequences in the CDS file with TEsorter: 35 | 36 | Remove CDS-related sequences in the EDTA library: 37 | 38 | Mon Mar 22 09:24:38 EDT 2021 EDTA final stage finished! You may check out: 39 | The final EDTA TE library: Vaccinium_corymbosum.faa.mod.EDTA.TElib.fa 40 | Mon Mar 22 09:24:38 EDT 2021 Perform post-EDTA analysis for whole-genome annotation: 41 | 42 | Mon Mar 22 17:03:06 EDT 2021 TE annotation using the EDTA library has finished! 
def import_filtered_TEs(tes_input_path, logger):
    """
    Import a pre-filtered TE file to the pipeline

    Args:
        tes_input_path (str): String to path of pre-filtered TE file

        logger (logging.Logger): object to log messages

    Returns:
        transposon_data (pandas.core.frame.DataFrame): A pandas dataframe
            representing the preprocessed transposon annotation file,
            sorted by chromosome then start position
    """
    # Column types expected in a correctly pre-filtered annotation
    expected_dtypes = {
        "Start": "float64",
        "Stop": "float64",
        "Length": "float64",
        "Chromosome": str,
        "Strand": str,
        "Order": str,
        "SuperFamily": str,
    }

    try:
        te_frame = pd.read_csv(
            tes_input_path,
            header="infer",
            sep="\t",
            dtype=expected_dtypes,
        )
    except Exception as err:
        msg = (
            "Error occurred while trying to read preprocessed TE "
            "annotation file into a Pandas dataframe, please refer "
            "to the README as to what information is expected"
        )
        logger.critical(msg)
        raise err

    # Check for missing data issues
    check_nulls(te_frame, logger)

    # Report out to user some quick data metrics
    logger.info(diagnostic_cleaner_helper(te_frame))

    # Sort for legibility
    te_frame.sort_values(by=["Chromosome", "Start"], inplace=True)

    logger.info("import of pre-filtered transposon annotation... success!")

    return te_frame


def diagnostic_cleaner_helper(TE_Data):
    """Return a human-readable summary of the filtered TE annotation."""
    chromosomes = TE_Data.Chromosome.unique()
    te_orders = TE_Data.Order.unique()
    superfamilies = TE_Data.SuperFamily.unique()
    info = f"""
    ---------------------------------
    Filtered TE Annotation Information:
    No. unique chromosomes: {len(chromosomes)}
    Unique chromosomes: {chromosomes}

    No. unique TE Orders: {len(te_orders)}
    Unique TE Orders: {te_orders}

    No. unique TE superfamilies: {len(superfamilies)}
    Unique TE superfamilies: {superfamilies}
    ---------------------------------
    """
    return info
def graph_barplot_density_differences(
    values,
    te_type,
    window_val,
    direction,
    number_of_zeros,
    output_dir,
    logger,
    display=False,
    align="left",
):
    """
    Plot a histogram of TE density differences between syntelog pairs

    Args:
        values (list): A list of values representing the TE density differences
            between syntelog pairs

        te_type (str): String representing the TE type being plotted

        window_val (int): Integer representing the current window of which the
            data is being plotted

        direction (str): string representing whether or not the graphs are
            coming from upstream or downstream TE density data

        number_of_zeros (int): count of syntelog pairs whose density
            difference was exactly zero; shown in the legend only

        output_dir (str): directory in which the output PNG is saved; the
            filename is built from te_type, window_val and direction

        logger (logging.Logger): Object to log information to

        display (boolean): Defaults to False, if True shows the plot upon
            generation with the plt.show() command

        align (str): Defaults to "left". NOTE(review): currently unused by
            the function body; kept for interface compatibility
    """

    # MAGIC, the bins to group density values for the histogram AND the values
    # for the xticks on the xaxis
    tick_bins = [
        -1.0,
        -0.9,
        -0.8,
        -0.7,
        -0.6,
        -0.5,
        -0.4,
        -0.3,
        -0.2,
        -0.1,
        0,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
        0.6,
        0.7,
        0.8,
        0.9,
        1.0,
    ]

    plt.figure(figsize=(8, 6))
    n, bins, patches = plt.hist(
        values, bins=tick_bins, facecolor="blue", ec="black", alpha=0.5
    )
    plt.rcParams["xtick.labelsize"] = 7  # MAGIC set size of axis ticks
    plt.ylabel("Number of Genes")
    plt.xlabel("Difference in TE Density Values")
    plt.title("O. glaberrima vs O. sativa")  # MAGIC genome name order here
    # Legend carries the run metadata rather than a separate caption
    N = mpatches.Patch(
        label="Total Plotted Genes: %s \nTE type: %s \nWindow: %s \nDirection: %s \nNo. 0 Differences: %s"
        % (len(values), te_type, window_val, direction, str(number_of_zeros))
    )
    plt.xticks(tick_bins)
    plt.legend(handles=[N])
    path = os.path.join(
        output_dir,
        (te_type + "_" + str(window_val) + "_" + direction + "_DensityDifferences.png"),
    )
    logger.info("Saving graph to: %s" % path)
    plt.savefig(path)
    if display:
        plt.show()
    plt.close()
$(ROOT_DIR)/src/Annotate_EDTA_Arabidopsis_thaliana.sb 28 | 29 | filter_genes: 30 | @echo Filtering Arabidopsis genes into appropriate format for TE Density 31 | python $(ROOT_DIR)/src/import_Arabidopsis_gene_anno.py $(DEV_ARAB_GENES) 32 | 33 | filter_TEs: 34 | @echo Filtering Arabidopsis TEs into appropriate format for TE Density 35 | python $(ROOT_DIR)/src/import_Arabidopsis_EDTA.py $(DEV_ARAB_TEs) 36 | 37 | run_TE_Density_HPCC: 38 | @echo Running TE Density for Glaberrima 39 | @echo sbatch file contains paths inside 40 | sbatch $(ROOT_DIR)/src/TE_Density_Arabidopsis.sb 41 | 42 | generate_dotplots: 43 | @echo Generating TE density dotplot for Arabidopsis 44 | mkdir -p $(DEV_RESULTS)/graphs 45 | python $(ROOT_DIR)/src/generate_dotplots.py $(DEV_HDF5)/Arabidopsis_Chr1.h5 $(DEV_FILTERED)/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv 1 -o $(DEV_RESULTS)/graphs 46 | 47 | compare_density_upstream_downstream: 48 | @echo Comparing upstream and downstream density arrays with chi-squared 49 | mkdir -p $(DEV_RESULTS)/chi_squared 50 | python $(ROOT_DIR)/src/chi_squared_density_comparison.py $(DEV_FILTERED)/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv $(DEV_HDF5) -o $(DEV_RESULTS)/chi_squared 51 | 52 | compare_centromeric: 53 | @echo Comparing centromeric/pericentromeric and regular density arrays 54 | mkdir -p $(DEV_RESULTS)/centromeric 55 | python $(ROOT_DIR)/src/compare_centromeric_densities.py $(DEV_FILTERED)/Cleaned_TAIR10_GFF3_genes_main_chromosomes.tsv $(DEV_HDF5) $(DEV_ARAB_EXP) -o $(DEV_RESULTS)/centromeric 56 | -------------------------------------------------------------------------------- /transposon/worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Generic consumer / producer with Process. 5 | 6 | Inherit and implement `Worker.execute_job`. 7 | Implement your 'job' container, enqueue to the job queue. 8 | Consumes jobs from the job queue, passes them to `execute_job`. 
class WorkerProcess(Process, ABC):
    """Consumes jobs, produces results.

    Subclasses implement `execute_job`.  Jobs are pulled from the input
    queue; results are pushed to the output queue.  Processing ends when
    the stop event is set or a Sentinel is received on the input queue.
    """

    TIMEOUT = 0.2  # MAGIC arbitrary, affects time to shutdown if not using sentinel

    def __init__(self, job_queue, result_queue, stop_event):
        """Initializer.

        Args:
            job_queue(Queue): input queue
            result_queue(Queue): output queue
            stop_event(Event): end if set
        """

        super().__init__()
        self._logger = logging.getLogger(self.__class__.__name__)
        self.input = job_queue
        self.output = result_queue
        self.stop_event = stop_event

    @abstractmethod
    def execute_job(self, job):
        """Target function for the worker.

        Args:
            job (tuple): caller's container pulled from the job queue
        Returns:
            tuple: caller's container to be enqueued to the result queue
        """

        pass

    def run(self):
        """Get jobs, put results.

        Returns if the stop event is set or the sentinel is received.
        """

        job = None
        result = None
        while not self.stop_event.is_set():

            # Do not pull a new job while a result is still pending; retry
            # sending it on the next pass instead.
            if not self._send_result(result):
                continue
            else:
                result = None

            try:
                job = job or self.input.get(timeout=self.TIMEOUT)
            except queue.Empty:
                continue
            except KeyboardInterrupt:
                break
            else:
                # Re-enqueue the sentinel so sibling workers also shut down.
                if isinstance(job, Sentinel):
                    self.input.put_nowait(job)
                    break

                result = self.execute_job(job)
                job = None

    def _send_result(self, result):
        """Try to enqueue a result.

        Args:
            result: container produced by `execute_job`, or None
        Returns:
            bool: True if 'result' is None or was enqueued; False if the
                output queue was full and the caller should retry.
        """

        if result is None:
            return True

        success = False
        try:
            self.output.put(result, timeout=self.TIMEOUT)
        except queue.Full:
            self._logger.warning("output queue is full!")
        else:
            # BUGFIX: was `sucess = True` (typo) and the method returned
            # True unconditionally, so results were silently dropped when
            # the output queue was full and `run` never retried them.
            success = True

        return success
7434.0 14 | augustus_masked-Fvb1-1-processed-gene-3.8 Fvb1-1 gene 374758.0 376307.0 - 1550.0 15 | maker-Fvb1-1-augustus-gene-3.18 Fvb1-1 gene 386791.0 388690.0 + 1900.0 16 | maker-Fvb1-1-augustus-gene-4.22 Fvb1-1 gene 412453.0 424954.0 - 12502.0 17 | maker-Fvb1-1-augustus-gene-4.23 Fvb1-1 gene 426581.0 435854.0 - 9274.0 18 | snap_masked-Fvb1-1-processed-gene-4.8 Fvb1-1 gene 437432.0 440233.0 + 2802.0 19 | snap_masked-Fvb1-1-processed-gene-4.10 Fvb1-1 gene 441720.0 441908.0 + 189.0 20 | augustus_masked-Fvb1-1-processed-gene-4.1 Fvb1-1 gene 454008.0 454529.0 + 522.0 21 | maker-Fvb1-1-snap-gene-4.25 Fvb1-1 gene 454566.0 457877.0 + 3312.0 22 | augustus_masked-Fvb1-1-processed-gene-4.3 Fvb1-1 gene 457884.0 462352.0 + 4469.0 23 | maker-Fvb1-1-snap-gene-4.29 Fvb1-1 gene 462652.0 465952.0 - 3301.0 24 | maker-Fvb1-1-augustus-gene-5.10 Fvb1-1 gene 523960.0 528517.0 + 4558.0 25 | maker-Fvb1-1-augustus-gene-5.11 Fvb1-1 gene 558851.0 561043.0 - 2193.0 26 | snap_masked-Fvb1-1-processed-gene-5.7 Fvb1-1 gene 568624.0 574304.0 - 5681.0 27 | snap_masked-Fvb1-1-processed-gene-6.4 Fvb1-1 gene 598901.0 599104.0 + 204.0 28 | maker-Fvb1-1-augustus-gene-6.12 Fvb1-1 gene 599215.0 600680.0 + 1466.0 29 | maker-Fvb1-1-augustus-gene-6.15 Fvb1-1 gene 600967.0 602849.0 - 1883.0 30 | snap_masked-Fvb1-1-processed-gene-6.11 Fvb1-1 gene 618100.0 622575.0 - 4476.0 31 | maker-Fvb1-1-snap-gene-6.19 Fvb1-1 gene 641786.0 649783.0 + 7998.0 32 | maker-Fvb1-1-augustus-gene-6.14 Fvb1-1 gene 660932.0 662110.0 + 1179.0 33 | maker-Fvb1-1-snap-gene-7.16 Fvb1-1 gene 690039.0 694198.0 + 4160.0 34 | maker-Fvb1-1-snap-gene-7.17 Fvb1-1 gene 706484.0 718558.0 + 12075.0 35 | snap_masked-Fvb1-1-processed-gene-7.12 Fvb1-1 gene 719899.0 720210.0 - 312.0 36 | maker-Fvb1-1-augustus-gene-7.15 Fvb1-1 gene 723171.0 724243.0 + 1073.0 37 | snap_masked-Fvb1-1-processed-gene-8.8 Fvb1-1 gene 797822.0 799572.0 + 1751.0 38 | maker-Fvb1-1-augustus-gene-8.23 Fvb1-1 gene 799576.0 801525.0 - 1950.0 39 | 
snap_masked-Fvb1-1-processed-gene-8.11 Fvb1-1 gene 804213.0 804524.0 + 312.0 40 | snap_masked-Fvb1-1-processed-gene-8.16 Fvb1-1 gene 854341.0 858564.0 - 4224.0 41 | snap_masked-Fvb1-1-processed-gene-8.17 Fvb1-1 gene 858574.0 858837.0 - 264.0 42 | -------------------------------------------------------------------------------- /transposon/notes: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | DESIGN 4 | 5 | - refactor merge data to do multiple processes per chromosome 6 | - so it doesn't suck 7 | - allow rewrite of files (open and close) 8 | - allow bitmap to track completed jobs (so I know which jobs need to be done) 9 | - so density calculators don't suck 10 | """ 11 | 12 | for each chromosome 13 | process = new_process(chromosome) # BAD 14 | for each gene in process; 15 | for each entry in gene / TE / window ... 16 | calculate 17 | insert 18 | 19 | 20 | for each chromosome 21 | workers = make_workers_for_a_chromosome(chrom) 22 | 23 | # split 24 | # mega job is composed of little jobs 25 | jobs = make_jobs(gene_subset, te_subset, window_subset) # pseudo-split 26 | 27 | # merge 28 | # for each result from the jobs, combine in some way 29 | merge_worker = make_merger() # 30 | better_density_data = new_density_data() 31 | with merge_worker as my_merger: 32 | result = get_result() 33 | better_density_data.insert_result(result) 34 | 35 | 36 | do_merge 37 | receive result 38 | 39 | insert result 40 | 41 | transmit success/failure 42 | 43 | do_work 44 | receive job 45 | 46 | create_subset of density 47 | loop 48 | 49 | transmit job 50 | 51 | 52 | # USER STORIES 53 | 54 | ## really basic access 55 | 56 | I just need density for one thing....(window/direction/te-type) 57 | 58 | can we add a column to the gene data panda frame, 59 | that column would be a specify TE density value for window/direction/te-type 60 | we aren't limited to adding to a gene data frame 61 | 62 | give me TE density values for all genes (2Darray?), 
e.g.: 63 | 64 | for all genes 65 | for one TE type 66 | for one window 67 | for one direction 68 | 69 | 70 | 71 | class NewHotness: 72 | 73 | def __init__(self): 74 | pass 75 | 76 | 77 | def fixture_density(): 78 | 79 | 80 | def test_muhgenes(): 81 | """ """ 82 | 83 | density = NewHotness() 84 | density.muhgenes(args...) # what are these args? 85 | 86 | 87 | 88 | @dataclass 89 | class DensityAccessor(): 90 | genes 91 | density 92 | te_order (Order|SuperFamily) 93 | te_name (LTR...) 94 | direction (left|intra|right) 95 | window (100 000 000...) 96 | 97 | def muhdensity(): 98 | """Convenience function to get some values.""" 99 | return pandaframe() 100 | 101 | def muhgenes(): 102 | """Generator to produce the densities across the genes""" 103 | for gene in allthegenes: 104 | self.gene = gene 105 | yield self.mudensity() 106 | 107 | 108 | ## what the heck are all these chromosome files, I just want my genome! 109 | 110 | having all of the files split up by chromosome may make logical sense 111 | from a density calculation standpoint b/c each on is indepenent 112 | BUT it's annoying when making analysis 113 | 114 | can we make one H5 file to rule them all? 115 | yes 116 | maybe just have one layer of abstraction, index each chromosome 117 | maybe concatenate the data? 
need to maintain what chromosome it came from 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /tests/test_transposon.tsv: -------------------------------------------------------------------------------- 1 | Fvb1-1 RepeatMasker DNA/Mutator 3350 4133 5.0 + 4992 Fxa_V1_4_26930 2 | Fvb1-1 RepeatMasker LTR/Gypsy 4209 4873 11.7 - 2969 Fvb4-4:6485593..6486917_LTR 3 | Fvb1-1 RepeatMasker LTR/Gypsy 4871 5232 16.0 - 1185 Fvb4-4:6485593..6486917_LTR 4 | Fvb1-1 RepeatMasker LTR/unknown 15404 16346 19.1 - 3262 Fvb1-1:9463282..9465926_LTR 5 | Fvb1-1 RepeatMasker LTR/unknown 16347 20224 3.8 - 30004 Fvb3-2:23517681..23521581_INT-int 6 | Fvb1-1 RepeatMasker LTR/unknown 20225 22848 15.4 - 12678 Fvb1-1:9463282..9465926_LTR 7 | Fvb1-1 RepeatMasker LTR/Gypsy 23450 24094 20.3 - 2636 Fvb1-1:5519794..5522056_LTR 8 | Fvb1-1 RepeatMasker LTR/unknown 24133 24415 19.0 - 1082 Fvb1-3:25198782..25200749_LTR 9 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 24412 24910 8.5 + 2257 family-72 10 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 25024 25752 18.8 + 1671 family-72 11 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 25751 25840 21.4 + 351 family-72 12 | Fvb1-1 RepeatMasker LTR/unknown 25842 27393 5.9 + 10968 Fvb6-4:17811941..17814568_LTR 13 | Fvb1-1 RepeatMasker LTR/Gypsy 27398 27999 19.6 + 1680 Fvb4-2:14377692..14383914_LTR 14 | Fvb1-1 RepeatMasker LTR/Gypsy 28097 28708 21.1 + 1651 Fvb4-2:14377692..14383914_LTR 15 | Fvb1-1 RepeatMasker LTR/unknown 28709 29048 4.4 - 2681 Fvb6-3:14035592..14036822_LTR 16 | Fvb1-1 RepeatMasker LTR/unknown 29051 30034 5.9 - 7242 Fvb7-3:21810512..21811992_LTR 17 | Fvb1-1 RepeatMasker LTR/unknown 30038 30447 20.0 + 1455 Fvb1-2:27872851..27875385_LTR 18 | Fvb1-1 RepeatMasker LTR/Gypsy 30440 31593 15.4 - 5517 Fvb1-1:1901529..1903797_LTR 19 | Fvb1-1 RepeatMasker LTR/Gypsy 30986 31928 12.8 + 3981 Fvb1-2:23736536..23738711_LTR 20 | Fvb1-1 RepeatMasker LTR/Gypsy 32011 33581 7.7 + 11550 Fvb6-4:12909946..12918059_INT-int 21 | Fvb1-1 
def import_genes(genes_input_path, logger):
    """
    Read a GFF-style gene annotation file into a pandas frame.

    Args:
        genes_input_path (str): path to the raw gene annotation file

        logger (logging.Logger): object to log messages. NOTE(review):
            currently unused here; kept for interface parity with the
            other import helpers

    Returns:
        pandas.core.frame.DataFrame: gene rows indexed by gene name, with
            Chromosome, Feature, Start, Stop, Strand and Length columns,
            restricted to scaffolds VaccDscaff0 through VaccDscaff48
    """
    raw_columns = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Frame",
        "FullName",
    ]

    wanted_columns = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Strand",
        "FullName",
    ]

    gene_frame = pd.read_csv(
        genes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=raw_columns,
        usecols=wanted_columns,
        dtype={"Stop": "float64", "Start": "float64"},
        comment="#",
    )

    # Keep only the 'gene' feature rows of the annotation
    gene_frame = gene_frame[gene_frame.Feature == "gene"]

    # Pull the gene name out of the attribute string and index rows by it
    gene_frame["Gene_Name"] = gene_frame["FullName"].str.extract(r"ID=(.*?);")
    gene_frame.set_index("Gene_Name", inplace=True)
    gene_frame = gene_frame.drop(columns=["FullName", "Software"])

    gene_frame.Strand = gene_frame.Strand.astype(str)
    gene_frame["Length"] = gene_frame.Stop - gene_frame.Start + 1

    gene_frame.sort_values(by=["Chromosome", "Start"], inplace=True)

    # MAGIC keep only scaffolds VaccDscaff0 through VaccDscaff48
    wanted_scaffolds = [f"VaccDscaff{i}" for i in range(49)]
    return gene_frame.loc[gene_frame["Chromosome"].isin(wanted_scaffolds)]
DEBUG" 96 | ) 97 | 98 | args = parser.parse_args() 99 | args.gene_input_file = os.path.abspath(args.gene_input_file) 100 | args.output_dir = os.path.abspath(args.output_dir) 101 | 102 | log_level = logging.DEBUG if args.verbose else logging.INFO 103 | logger = logging.getLogger(__name__) 104 | coloredlogs.install(level=log_level) 105 | 106 | # Execute 107 | cleaned_genes = import_genes(args.gene_input_file, logger) 108 | write_cleaned_genes(cleaned_genes, args.output_dir, "Blueberry", logger) 109 | -------------------------------------------------------------------------------- /examples/Human/src/import_human_te_anno.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Reformat the human TE file into a format conducive to the Transposon_Data class 5 | """ 6 | 7 | __author__ = "Scott Teresi" 8 | 9 | import pandas as pd 10 | import argparse 11 | import logging 12 | import os 13 | import coloredlogs 14 | 15 | from examples.Human.src.replace_human_TE_names import te_annot_renamer 16 | 17 | 18 | def write_cleaned_TEs(te_pandaframe, output_dir, genome_name, logger): 19 | file_name = os.path.join( 20 | output_dir, ("Cleaned_Chr7_13_" + genome_name + "_TEs.tsv") 21 | ) 22 | 23 | logger.info("Writing cleaned TE file to: %s" % file_name) 24 | te_pandaframe.to_csv(file_name, sep="\t", header=True, index=False) 25 | 26 | 27 | def import_human_tes(human_TE_file, logger): 28 | """ 29 | We want a pandas object with Chromosome, Start, Stop, Strand, Order, SuperFamily and Length as columns 30 | """ 31 | data = pd.read_csv( 32 | human_TE_file, 33 | header="infer", 34 | sep="\t", 35 | dtype={ 36 | "genoStart": "float64", 37 | "genoEnd": "float64", 38 | "strand": str, 39 | "genoName": str, 40 | "repClass": str, 41 | "repFamily": str, 42 | }, 43 | ) 44 | data.drop( 45 | columns=[ 46 | "#bin", 47 | "swScore", 48 | "milliDiv", 49 | "milliIns", 50 | "milliDel", 51 | "id", 52 | "genoLeft", 53 | "repStart", 54 | 
"repEnd", 55 | "repLeft", 56 | "repName", 57 | ], 58 | inplace=True, 59 | ) 60 | data.rename( 61 | columns={ 62 | "genoName": "Chromosome", 63 | "repClass": "Order", 64 | "repFamily": "SuperFamily", 65 | "strand": "Strand", 66 | "genoStart": "Start", 67 | "genoEnd": "Stop", 68 | }, 69 | inplace=True, 70 | ) 71 | 72 | data["Length"] = data.Stop - data.Start + 1 73 | # NOTE only grabbing specific chromosomes 74 | chromosomes_i_want = ["chr7", "chr13"] # MAGIC 75 | data = data.loc[data["Chromosome"].isin(chromosomes_i_want)] 76 | 77 | data = te_annot_renamer(data) 78 | 79 | return data 80 | 81 | 82 | if __name__ == "__main__": 83 | """Command line interface to calculate density.""" 84 | 85 | parser = argparse.ArgumentParser(description="Reformat TE annotation file") 86 | path_main = os.path.abspath(__file__) 87 | dir_main = os.path.dirname(path_main) 88 | output_default = os.path.join( 89 | dir_main, "../../../../", "TE_Data/filtered_input_data" 90 | ) 91 | parser.add_argument( 92 | "TE_input_file", type=str, help="Parent path of TE annotation file" 93 | ) 94 | 95 | parser.add_argument( 96 | "--output_dir", 97 | "-o", 98 | type=str, 99 | default=output_default, 100 | help="Parent directory to output results", 101 | ) 102 | 103 | parser.add_argument( 104 | "-v", "--verbose", action="store_true", help="set debugging level to DEBUG" 105 | ) 106 | 107 | args = parser.parse_args() 108 | args.TE_input_file = os.path.abspath(args.TE_input_file) 109 | args.output_dir = os.path.abspath(args.output_dir) 110 | 111 | log_level = logging.DEBUG if args.verbose else logging.INFO 112 | logger = logging.getLogger(__name__) 113 | coloredlogs.install(level=log_level) 114 | 115 | cleaned_tes = import_human_tes(args.TE_input_file, logger) 116 | write_cleaned_TEs(cleaned_tes, args.output_dir, "Human", logger) 117 | -------------------------------------------------------------------------------- /tests/unit/test_import_genes.py: 
#!/usr/bin/env python3

"""
Unit test the gene annotation import logic: parsing genomic coordinates as
float32 silently loses precision on large values while float64 does not.
"""

__author__ = "Scott Teresi"

import os
import pytest
import numpy as np
import pandas as pd
from io import StringIO


col_names = [
    "Chromosome",
    "Software",
    "Feature",
    "Start",
    "Stop",
    "Score",
    "Strand",
    "Frame",
    "FullName",
]

col_to_use = [
    "Chromosome",
    "Software",
    "Feature",
    "Start",
    "Stop",
    "Strand",
    "FullName",
]

gene_anno_path = "tests/input_data/Test_Gene_Anno_Float_Conversion.tsv"


def _import_gene_anno(float_dtype, annotation_path=gene_anno_path):
    """Read a gene annotation with Start/Stop parsed as the given dtype.

    Shared implementation for the float32/float64 fixtures below, which
    previously duplicated this code verbatim. The annotation source is a
    parameter so callers can supply any path or file-like object.

    Args:
        float_dtype (str): numpy dtype name for the Start/Stop columns.
        annotation_path (str or file-like): annotation source to read.

    Returns:
        pandas.DataFrame: gene rows indexed by the parsed gene name.
    """
    gene_anno = pd.read_csv(
        annotation_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        dtype={"Start": float_dtype, "Stop": float_dtype},
        comment="#",
    )
    gene_anno = gene_anno[gene_anno.Feature == "gene"]  # drop non-gene rows
    # clean the names and set as the index (get row wrt name c.f. idx)
    gene_anno["Gene_Name"] = gene_anno["FullName"].str.extract(r"ID=(.*?);")
    gene_anno.set_index("Gene_Name", inplace=True)
    gene_anno = gene_anno.drop(columns=["FullName", "Software"])
    gene_anno.Strand = gene_anno.Strand.astype(str)
    gene_anno["Length"] = gene_anno.Stop - gene_anno.Start + 1
    return gene_anno


@pytest.fixture
def import_as_float32():
    """Test annotation with coordinates parsed as float32."""
    return _import_gene_anno("float32")


@pytest.fixture
def import_as_float64():
    """Test annotation with coordinates parsed as float64."""
    return _import_gene_anno("float64")


true_start_list = [
    41.0,
    5556.0,
    8487.0,
    9361.0,
    11127.0,
    84598.0,
    117287120.0,
    118974314397.0,
    22456307315831.0,
    88877765432319026.0,
]
# NOTE(review): unused placeholder (and not a list despite the name); kept
# so the module namespace is unchanged -- TODO remove or populate.
true_stop_list = 2


def test_small_numbers_to_float32(import_as_float32):
    """
    Using float32, does the import produce the right Start values when the
    annotation's coordinates are small?
    (BUGFIX: docstring previously said float64.)
    """
    assert import_as_float32.Start.to_list()[0:5] == true_start_list[0:5]


def test_large_numbers_to_float32(import_as_float32):
    """
    Using float32, large coordinates exceed the 24-bit mantissa, so the
    comparison is expected to fail.
    (BUGFIX: docstring previously said float64.)
    """
    with pytest.raises(AssertionError):
        assert import_as_float32.Start.to_list()[4:] == true_start_list[4:]


def test_large_numbers_to_float64(import_as_float64):
    """
    Using float64, does the import produce the right Start values when the
    annotation's coordinates are large?
    """
    assert import_as_float64.Start.to_list()[4:] == true_start_list[4:]


if __name__ == "__main__":
    pytest.main(["-svv", __file__])  # for convenience
#!/usr/bin/env python3

"""
Reformat CDS Fasta files for EDTA usage
"""

__author__ = "Scott Teresi"

import argparse
import os
import logging
import coloredlogs
from Bio import SeqIO


def reformat_seq_iq(input_fasta, genome_name, output_dir, logger):
    """
    Reformat a CDS FASTA file to have shorter sequence ID names for EDTA

    Args:
        input_fasta (str): String path to input fasta file

        genome_name (str): String for genome name

        output_dir (str): Path to output dir

        logger (logging.Logger): Object to log information to

    Returns:
        None, just saves the edited FASTA file to disk. Also writes a
        conversion table to disk for the old names and their new name
        counterparts
    """
    # MAGIC file suffixes
    new_fasta = os.path.join(output_dir, (genome_name + "_CDS_NewNames.fasta"))
    name_key = os.path.join(output_dir, (genome_name + "_CDS_Seq_ID_Conversion.txt"))

    if os.path.exists(new_fasta):
        os.remove(new_fasta)  # remove stale output; the fasta is opened in
        # append mode below
    if os.path.exists(name_key):
        os.remove(name_key)
    pair_dict = {}  # NB this is used to write the conversion key later for
    # clarity
    # BUGFIX: the output handle is opened once for the whole run; the
    # original re-opened the file in append mode for every record, costing
    # one open/close pair per sequence. The input handle also no longer
    # shadows the `input_fasta` argument.
    with open(input_fasta, "r") as in_handle, open(new_fasta, "a") as out_handle:
        for s_record in SeqIO.parse(in_handle, "fasta"):
            # NB the s_record.id and s_record.description combined contain
            # all the information for each entry following the '>' character
            # in the fasta

            # NB In this case:
            # We just want the s_record.id which correctly points to the
            # first integer. E.g '1 dna:chromosome blah blah blah' we just want
            # 1.
            s_record.id = s_record.id.replace("transcript:", "")
            pair_dict[s_record.id] = s_record.id + " " + s_record.description
            s_record.description = ""  # NB edit the description so that when
            # we rewrite we don't have the extraneous info
            SeqIO.write(s_record, out_handle, "fasta")
    logger.info(
        "Finished writing new fasta to: %s" % os.path.join(output_dir, new_fasta)
    )

    with open(name_key, "w") as output:
        for key, val in pair_dict.items():
            # Write the conversion table for record-keeping.
            output.write("%s\t%s\n" % (key, val))
    logger.info(
        "Finished writing name conversion table to: %s"
        % os.path.join(output_dir, name_key)
    )


if __name__ == "__main__":

    path_main = os.path.abspath(__file__)
    dir_main = os.path.dirname(path_main)
    parser = argparse.ArgumentParser(description="Reformat FASTA for EDTA")

    parser.add_argument("fasta_input_file", type=str, help="parent path of fasta file")
    parser.add_argument("genome_id", type=str, help="name of genome")
    output_default = os.path.join(
        dir_main, "../../../../", "TE_Density_Example_Data/Rice"
    )
    parser.add_argument(
        "--output_dir",
        "-o",
        type=str,
        default=output_default,
        help="Parent directory to output results",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="set debugging level to DEBUG"
    )
    args = parser.parse_args()
    args.fasta_input_file = os.path.abspath(args.fasta_input_file)
    args.output_dir = os.path.join(args.output_dir, args.genome_id, "Sequences")

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = logging.getLogger(__name__)
    coloredlogs.install(level=log_level)

    reformat_seq_iq(args.fasta_input_file, args.genome_id, args.output_dir, logger)
"Ngaro", 20 | "RYV": "VIPER", 21 | "RPP": "Penelope", 22 | "RIR": "R2", 23 | "RIT": "RTE", 24 | "RIJ": "Jockey", 25 | "RIL": "L1", 26 | "RII": "I", 27 | "RST": "tRNA", 28 | "RSL": "7SL", 29 | "RSS": "5S", 30 | "DTT": "Tc1-Mariner", 31 | "DTA": "hAT", 32 | "DTM": "Mutator", 33 | "DTE": "Merlin", 34 | "DTR": "Transib", 35 | "DTP": "P", 36 | "DTB": "PiggyBac", 37 | "DTH": "PIF-Harbinger", 38 | "DTC": "CACTA", 39 | "DYC": "Crypton", 40 | "DHH": "Helitron", 41 | "DMM": "Maverick", 42 | # Custom changes 43 | "unknown": U, 44 | "Unknown": U, 45 | "None": U, 46 | "EnSpm_CACTA": "CACTA", 47 | "MuDR_Mutator": "Mutator", 48 | "PIF_Harbinger": "PIF-Harbinger", 49 | } 50 | 51 | TE_Data.SuperFamily.fillna( 52 | value="Unknown_Superfam", inplace=True 53 | ) # replace None w U 54 | 55 | # Invoke dictionary to fix names 56 | TE_Data.Order.replace(master_order, inplace=True) 57 | TE_Data.SuperFamily.replace(master_superfamily, inplace=True) 58 | 59 | # Rename the superfamily value for pararetros as pararetrovirus 60 | TE_Data.loc[TE_Data.Order == "pararetrovirus", "SuperFamily"] = "pararetrovirus" 61 | 62 | # Rename unknown LINE element superfamilies to Unknown_LINE_Superfam to 63 | # distinguish between other unknowns 64 | TE_Data.loc[ 65 | (TE_Data.Order == "LINE") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 66 | "SuperFamily", 67 | ] = "Unknown_LINE_Superfam" 68 | 69 | # Rename unknown LTR element superfamilies to Unknown_LTR_Superfam to 70 | # distinguish between other unknowns 71 | TE_Data.loc[ 72 | (TE_Data["Order"] == "LTR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 73 | "SuperFamily", 74 | ] = "Unknown_LTR_Superfam" 75 | 76 | # Rename unknown TIR element superfamilies to Unknown_TIR_Superfam to 77 | # distinguish between other unknowns 78 | TE_Data.loc[ 79 | (TE_Data.Order == "TIR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 80 | "SuperFamily", 81 | ] = "Unknown_TIR_Superfam" 82 | 83 | # Rename both values for Helitron elements, so that 'Helitron' is 84 
| # both the Order and SuperFamily value 85 | # Some Helitron elements were labeled 'DNA' in the Order location, this is 86 | # technically correct but I prefer to differentiate the TIR DNA elements 87 | # from DNA elements as a whole 88 | TE_Data.loc[ 89 | (TE_Data["Order"] == "TIR") & (TE_Data["SuperFamily"] == "Helitron"), 90 | ["Order", "SuperFamily"], 91 | ] = "Helitron" 92 | # If the Order is Helitron and the SuperFamily is unknown make the 93 | # superfamily 'Helitron' 94 | TE_Data.loc[ 95 | (TE_Data["Order"] == "Helitron") 96 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 97 | "SuperFamily", 98 | ] = "Helitron" 99 | 100 | # For TEs that are unknown for both Order AND SuperFamily we will call 101 | # those 'Completely_Unknown' 102 | TE_Data.loc[ 103 | (TE_Data["Order"] == "Unknown_Order") 104 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 105 | ["Order", "SuperFamily"], 106 | ] = "Completely_Unknown" 107 | 108 | return TE_Data 109 | 110 | 111 | def diagnostic_cleaner_helper(TE_Data): 112 | print() 113 | print(TE_Data.Order.unique()) 114 | print(TE_Data.SuperFamily.unique()) 115 | print() 116 | 117 | # To see unique for a given type: 118 | # print(TE_Data.loc[TE_Data['Order'] == 'LINE'].SuperFamily.unique()) 119 | return None 120 | -------------------------------------------------------------------------------- /examples/Arabidopsis/src/replace_names_Arabidopsis.py: -------------------------------------------------------------------------------- 1 | def te_annot_renamer(TE_Data): 2 | U = "Unknown_Order" 3 | master_order = { 4 | "Unknown": U, 5 | "MITE": "TIR", 6 | "pararetrovirus": "pararetrovirus", 7 | "DNA": "TIR", 8 | } 9 | 10 | U = "Unknown_Superfam" 11 | master_superfamily = { 12 | # EDTA/Wicker et al 2007 renames to common name: 13 | "RLC": "Copia", 14 | "RLG": "Gypsy", 15 | "RLB": "Bel_Pao", 16 | "RLR": "Retrovirus", 17 | "RLE": "ERV", 18 | "RYD": "DIRS", 19 | "RYN": "Ngaro", 20 | "RYV": "VIPER", 21 | "RPP": "Penelope", 22 | "RIR": "R2", 23 | 
"RIT": "RTE", 24 | "RIJ": "Jockey", 25 | "RIL": "L1", 26 | "RII": "I", 27 | "RST": "tRNA", 28 | "RSL": "7SL", 29 | "RSS": "5S", 30 | "DTT": "Tc1-Mariner", 31 | "DTA": "hAT", 32 | "DTM": "Mutator", 33 | "DTE": "Merlin", 34 | "DTR": "Transib", 35 | "DTP": "P", 36 | "DTB": "PiggyBac", 37 | "DTH": "PIF-Harbinger", 38 | "DTC": "CACTA", 39 | "DYC": "Crypton", 40 | "DHH": "Helitron", 41 | "DMM": "Maverick", 42 | # Custom changes 43 | "unknown": U, 44 | "Unknown": U, 45 | "None": U, 46 | "EnSpm_CACTA": "CACTA", 47 | "MuDR_Mutator": "Mutator", 48 | "PIF_Harbinger": "PIF-Harbinger", 49 | } 50 | 51 | TE_Data.SuperFamily.fillna( 52 | value="Unknown_Superfam", inplace=True 53 | ) # replace None w U 54 | 55 | # Invoke dictionary to fix names 56 | TE_Data.Order.replace(master_order, inplace=True) 57 | TE_Data.SuperFamily.replace(master_superfamily, inplace=True) 58 | 59 | # Rename the superfamily value for pararetros as pararetrovirus 60 | TE_Data.loc[TE_Data.Order == "pararetrovirus", "SuperFamily"] = "pararetrovirus" 61 | 62 | # Rename unknown LINE element superfamilies to Unknown_LINE_Superfam to 63 | # distinguish between other unknowns 64 | TE_Data.loc[ 65 | (TE_Data.Order == "LINE") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 66 | "SuperFamily", 67 | ] = "Unknown_LINE_Superfam" 68 | 69 | # Rename unknown LTR element superfamilies to Unknown_LTR_Superfam to 70 | # distinguish between other unknowns 71 | TE_Data.loc[ 72 | (TE_Data["Order"] == "LTR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 73 | "SuperFamily", 74 | ] = "Unknown_LTR_Superfam" 75 | 76 | # Rename unknown TIR element superfamilies to Unknown_TIR_Superfam to 77 | # distinguish between other unknowns 78 | TE_Data.loc[ 79 | (TE_Data.Order == "TIR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 80 | "SuperFamily", 81 | ] = "Unknown_TIR_Superfam" 82 | 83 | # Rename both values for Helitron elements, so that 'Helitron' is 84 | # both the Order and SuperFamily value 85 | # Some Helitron elements were 
labeled 'DNA' in the Order location, this is 86 | # technically correct but I prefer to differentiate the TIR DNA elements 87 | # from DNA elements as a whole 88 | TE_Data.loc[ 89 | (TE_Data["Order"] == "TIR") & (TE_Data["SuperFamily"] == "Helitron"), 90 | ["Order", "SuperFamily"], 91 | ] = "Helitron" 92 | # If the Order is Helitron and the SuperFamily is unknown make the 93 | # superfamily 'Helitron' 94 | TE_Data.loc[ 95 | (TE_Data["Order"] == "Helitron") 96 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 97 | "SuperFamily", 98 | ] = "Helitron" 99 | 100 | # For TEs that are unknown for both Order AND SuperFamily we will call 101 | # those 'Completely_Unknown' 102 | TE_Data.loc[ 103 | (TE_Data["Order"] == "Unknown_Order") 104 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 105 | ["Order", "SuperFamily"], 106 | ] = "Completely_Unknown" 107 | 108 | return TE_Data 109 | 110 | 111 | def diagnostic_cleaner_helper(TE_Data): 112 | print() 113 | print(TE_Data.Order.unique()) 114 | print(TE_Data.SuperFamily.unique()) 115 | print() 116 | 117 | # To see unique for a given type: 118 | # print(TE_Data.loc[TE_Data['Order'] == 'LINE'].SuperFamily.unique()) 119 | return None 120 | -------------------------------------------------------------------------------- /examples/Blueberry_Expression/src/replace_names_blueberry.py: -------------------------------------------------------------------------------- 1 | def te_annot_renamer(TE_Data): 2 | U = "Unknown_Order" 3 | master_order = { 4 | "Unknown": U, 5 | "MITE": "TIR", 6 | "pararetrovirus": "pararetrovirus", 7 | "DNA": "TIR", 8 | } 9 | 10 | U = "Unknown_Superfam" 11 | master_superfamily = { 12 | # EDTA/Wicker et al 2007 renames to common name: 13 | "RLC": "Copia", 14 | "RLG": "Gypsy", 15 | "RLB": "Bel_Pao", 16 | "RLR": "Retrovirus", 17 | "RLE": "ERV", 18 | "RYD": "DIRS", 19 | "RYN": "Ngaro", 20 | "RYV": "VIPER", 21 | "RPP": "Penelope", 22 | "RIR": "R2", 23 | "RIT": "RTE", 24 | "RIJ": "Jockey", 25 | "RIL": "L1", 26 | "RII": 
"I", 27 | "RST": "tRNA", 28 | "RSL": "7SL", 29 | "RSS": "5S", 30 | "DTT": "Tc1-Mariner", 31 | "DTA": "hAT", 32 | "DTM": "Mutator", 33 | "DTE": "Merlin", 34 | "DTR": "Transib", 35 | "DTP": "P", 36 | "DTB": "PiggyBac", 37 | "DTH": "PIF-Harbinger", 38 | "DTC": "CACTA", 39 | "DYC": "Crypton", 40 | "DHH": "Helitron", 41 | "DMM": "Maverick", 42 | # Custom changes 43 | "unknown": U, 44 | "Unknown": U, 45 | "None": U, 46 | "EnSpm_CACTA": "CACTA", 47 | "MuDR_Mutator": "Mutator", 48 | "PIF_Harbinger": "PIF-Harbinger", 49 | } 50 | 51 | TE_Data.SuperFamily.fillna( 52 | value="Unknown_Superfam", inplace=True 53 | ) # replace None w U 54 | 55 | # Invoke dictionary to fix names 56 | TE_Data.Order.replace(master_order, inplace=True) 57 | TE_Data.SuperFamily.replace(master_superfamily, inplace=True) 58 | 59 | # Rename the superfamily value for pararetros as pararetrovirus 60 | TE_Data.loc[TE_Data.Order == "pararetrovirus", "SuperFamily"] = "pararetrovirus" 61 | 62 | # Rename unknown LINE element superfamilies to Unknown_LINE_Superfam to 63 | # distinguish between other unknowns 64 | TE_Data.loc[ 65 | (TE_Data.Order == "LINE") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 66 | "SuperFamily", 67 | ] = "Unknown_LINE_Superfam" 68 | 69 | # Rename unknown LTR element superfamilies to Unknown_LTR_Superfam to 70 | # distinguish between other unknowns 71 | TE_Data.loc[ 72 | (TE_Data["Order"] == "LTR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 73 | "SuperFamily", 74 | ] = "Unknown_LTR_Superfam" 75 | 76 | # Rename unknown TIR element superfamilies to Unknown_TIR_Superfam to 77 | # distinguish between other unknowns 78 | TE_Data.loc[ 79 | (TE_Data.Order == "TIR") & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 80 | "SuperFamily", 81 | ] = "Unknown_TIR_Superfam" 82 | 83 | # Rename both values for Helitron elements, so that 'Helitron' is 84 | # both the Order and SuperFamily value 85 | # Some Helitron elements were labeled 'DNA' in the Order location, this is 86 | # technically 
correct but I prefer to differentiate the TIR DNA elements 87 | # from DNA elements as a whole 88 | TE_Data.loc[ 89 | (TE_Data["Order"] == "TIR") & (TE_Data["SuperFamily"] == "Helitron"), 90 | ["Order", "SuperFamily"], 91 | ] = "Helitron" 92 | # If the Order is Helitron and the SuperFamily is unknown make the 93 | # superfamily 'Helitron' 94 | TE_Data.loc[ 95 | (TE_Data["Order"] == "Helitron") 96 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 97 | "SuperFamily", 98 | ] = "Helitron" 99 | 100 | # For TEs that are unknown for both Order AND SuperFamily we will call 101 | # those 'Completely_Unknown' 102 | TE_Data.loc[ 103 | (TE_Data["Order"] == "Unknown_Order") 104 | & (TE_Data["SuperFamily"] == "Unknown_Superfam"), 105 | ["Order", "SuperFamily"], 106 | ] = "Completely_Unknown" 107 | 108 | return TE_Data 109 | 110 | 111 | def diagnostic_cleaner_helper(TE_Data): 112 | print() 113 | print(TE_Data.Order.unique()) 114 | print(TE_Data.SuperFamily.unique()) 115 | print() 116 | 117 | # To see unique for a given type: 118 | # print(TE_Data.loc[TE_Data['Order'] == 'LINE'].SuperFamily.unique()) 119 | return None 120 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/src/import_syntelogs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __author__ = "Scott Teresi" 4 | 5 | import pandas as pd 6 | 7 | 8 | def import_syntelogs(syntelog_input_file): 9 | """ 10 | Import the syntelogs from the raw file and manage data filtration 11 | """ 12 | 13 | col_names = [ 14 | "OrgA_Chromosome", 15 | "OrgA_Gene_Region", 16 | "OrgA_Start", 17 | "OrgA_Stop", 18 | "OrgB_Chromosome", 19 | "OrgB_Gene_Region", 20 | "OrgB_Start", 21 | "OrgB_Stop", 22 | "E_Value", 23 | "Diagonal_Score", 24 | "Web_Link", 25 | ] 26 | 27 | col_to_use = [ 28 | "OrgA_Chromosome", 29 | "OrgA_Gene_Region", 30 | "OrgB_Chromosome", 31 | "OrgB_Gene_Region", 32 | "E_Value", 33 | "Diagonal_Score", 34 | ] 
#!/usr/bin/env python3

__author__ = "Scott Teresi"

import pandas as pd


def import_syntelogs(syntelog_input_file):
    """
    Import the syntelogs from the raw file and manage data filtration

    Args:
        syntelog_input_file (str or file-like): SynMap output, tab-separated,
            with '#'-prefixed comment lines.

    Returns:
        pandas.DataFrame: columns OrgA_Chromosome, Glaberrima,
        OrgB_Chromosome, Sativa and E_Value, limited to same-chromosome
        pairs on chromosomes 1-12 with E-value below 0.05.
    """

    col_names = [
        "OrgA_Chromosome",
        "OrgA_Gene_Region",
        "OrgA_Start",
        "OrgA_Stop",
        "OrgB_Chromosome",
        "OrgB_Gene_Region",
        "OrgB_Start",
        "OrgB_Stop",
        "E_Value",
        "Diagonal_Score",
        "Web_Link",
    ]

    col_to_use = [
        "OrgA_Chromosome",
        "OrgA_Gene_Region",
        "OrgB_Chromosome",
        "OrgB_Gene_Region",
        "E_Value",
        "Diagonal_Score",
    ]

    syntelog_pandaframe = pd.read_csv(
        syntelog_input_file,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        comment="#",
        dtype={
            "OrgA_Chromosome": str,
            "OrgA_Gene_Region": str,
            "OrgB_Chromosome": str,
            "OrgB_Gene_Region": str,
            "E_Value": "float64",
            "Diagonal_Score": "int32",
        },
    )

    # Get the correct name for the genes
    # MAGIC to split the name correctly
    # BUGFIX: raw strings; the previous "\|\|" literal contains the invalid
    # escape '\|', which is a SyntaxWarning from Python 3.12 onwards. The
    # runtime value (the regex matching '||') is unchanged.
    syntelog_pandaframe["OrgA_Gene_Region"] = (
        syntelog_pandaframe["OrgA_Gene_Region"].str.split(r"\|\|").str[3]
    )
    syntelog_pandaframe["OrgB_Gene_Region"] = (
        syntelog_pandaframe["OrgB_Gene_Region"].str.split(r"\|\|").str[3]
    )

    # Remove rows that have transcript in the name because not worth dealing
    # with for example
    syntelog_pandaframe = syntelog_pandaframe[
        ~syntelog_pandaframe["OrgA_Gene_Region"].str.contains("transcript")
    ]
    syntelog_pandaframe = syntelog_pandaframe[
        ~syntelog_pandaframe["OrgB_Gene_Region"].str.contains("transcript")
    ]

    # Get the correct name for the gene names
    # MAGIC to split the name correctly
    syntelog_pandaframe["OrgA_Gene_Region"] = (
        syntelog_pandaframe["OrgA_Gene_Region"].str.split("CDS:").str[1]
    )
    syntelog_pandaframe["OrgB_Gene_Region"] = (
        syntelog_pandaframe["OrgB_Gene_Region"].str.split("CDS:").str[1]
    )

    syntelog_pandaframe["OrgA_Gene_Region"] = (
        syntelog_pandaframe["OrgA_Gene_Region"].str.split(".").str[0]
    )
    syntelog_pandaframe["OrgB_Gene_Region"] = (
        syntelog_pandaframe["OrgB_Gene_Region"].str.split("-").str[0]
    )

    # SynMap returns the transcript name for Sativa which can have slight
    # differences with the gene name, namely the letter g is replaced with
    # the letter t.
    # NB fix to get the gene name
    syntelog_pandaframe["OrgB_Gene_Region"] = syntelog_pandaframe[
        "OrgB_Gene_Region"
    ].str.replace("t", "g")

    # Get the correct name for the chromosome
    # MAGIC
    syntelog_pandaframe["OrgA_Chromosome"] = (
        syntelog_pandaframe["OrgA_Chromosome"].str.split("_").str[1]
    )
    syntelog_pandaframe["OrgB_Chromosome"] = (
        syntelog_pandaframe["OrgB_Chromosome"].str.split("_").str[1]
    )

    # This step is important, it could differ if your data input is different.
    syntelog_pandaframe.rename(
        columns={"OrgA_Gene_Region": "Glaberrima", "OrgB_Gene_Region": "Sativa"},
        inplace=True,
    )
    # Trim E-values less than 0.05
    # MAGIC
    syntelog_pandaframe = syntelog_pandaframe.loc[syntelog_pandaframe["E_Value"] < 0.05]

    syntelog_pandaframe.drop(
        columns=["Diagonal_Score"],
        inplace=True,
    )

    # I only want pairs where the chromosomes are equal
    syntelog_pandaframe = syntelog_pandaframe.loc[
        syntelog_pandaframe["OrgA_Chromosome"] == syntelog_pandaframe["OrgB_Chromosome"]
    ]

    # MAGIC rice has 12 chromosomes
    chromosome_list = [str(i) for i in range(1, 12 + 1)]
    syntelog_pandaframe = syntelog_pandaframe.loc[
        syntelog_pandaframe["OrgA_Chromosome"].isin(chromosome_list)
    ]

    return syntelog_pandaframe
#!/usr/bin/env python
# BUGFIX: shebang was "#!/usr/bin/env/python" (a nonexistent path inside
# env), which made the script unrunnable as an executable.

"""
Retrieve info of a list of genes
"""

__author__ = "Scott Teresi"

import argparse
import os
import logging
import coloredlogs

from transposon.gene_data import GeneData
from transposon.density_data import DensityData
from transposon.import_filtered_genes import import_filtered_genes


if __name__ == "__main__":
    path_main = os.path.abspath(__file__)
    dir_main = os.path.dirname(path_main)
    parser = argparse.ArgumentParser(
        description="""output to std_out information of each gene in a
        user-provided list of genes using the DensityData.info_of_gene()
        function"""
    )

    parser.add_argument(
        "te_density_hdf5_result",
        type=str,
        help="""Path to the HDF5 file that contains the genes that the user wants
        to extract TE density data from""",
    )

    parser.add_argument(
        "cleaned_gene_data_file",
        type=str,
        help="""Path to the cleaned gene data
        file that was produced prior to running the pipeline, this is necessary
        to initialize the DensityData obj""",
    )

    parser.add_argument(
        "chromosome_to_subset_gene_data",
        type=str,
        help="""The cleaned gene data
        file may contain information of genes from multiple chromosomes,
        because the TE Density data corresponds to one chromosome, please
        specify the appropriate chromosome identifier for the density data you
        are trying to access, so that we may appropriately subset the gene
        data""",
    )

    parser.add_argument(
        "genome_id",
        type=str,
        help="""Please specify the genome ID, use the same one you used as an
        argument when running TE Density""",
    )

    parser.add_argument(
        "list_of_genes",
        type=str,
        help="Path to list of genes files",
    )

    parser.add_argument(
        "--window_idx",
        type=int,
        default=1,
        help="""Index of the window that you want to access information from,
        windows go from lowest to highest, default TE density settings yield 20
        windows (500, 10000, 500) (start, stop, step). Stop is inclusive""",
    )

    parser.add_argument(
        "--n_te_types",
        type=int,
        default=5,
        help="""Number of TE types that you want to display when showing the
        top and bottom TE categories for density relative to your gene of
        interest""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="set debugging level to DEBUG"
    )

    args = parser.parse_args()
    args.te_density_hdf5_result = os.path.abspath(args.te_density_hdf5_result)
    args.cleaned_gene_data_file = os.path.abspath(args.cleaned_gene_data_file)
    args.list_of_genes = os.path.abspath(args.list_of_genes)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = logging.getLogger(__name__)
    coloredlogs.install(level=log_level)

    # Read pandas dataframe from cleaned genes file
    full_genome_gene_data = import_filtered_genes(args.cleaned_gene_data_file, logger)

    # MAGIC, chromosome category is inherent to the pandas data frame
    full_genome_gene_data_subsetted = full_genome_gene_data.loc[
        full_genome_gene_data["Chromosome"] == args.chromosome_to_subset_gene_data
    ]

    # Initialize GeneData obj
    specific_chromosome_gene_data = GeneData(
        full_genome_gene_data_subsetted, args.genome_id
    )

    # Initialize DensityData obj
    processed_density_data = DensityData.verify_h5_cache(
        args.te_density_hdf5_result, specific_chromosome_gene_data, logger
    )

    # Read list of genes file from user and begin printing output to console
    with open(args.list_of_genes, "r", encoding="utf-8") as in_file:
        all_genes = [gene.strip() for gene in in_file]
    for gene in all_genes:
        print(
            processed_density_data.info_of_gene(gene, args.window_idx, args.n_te_types)
        )
# TODO candidate for deletion


def te_annot_renamer(TE_Data):
    """Normalize Order/SuperFamily labels of a RepeatMasker/EDTA TE annotation.

    Replaces Wicker-style codes with common names, drops annotation
    bookkeeping and contig rows, and collapses fully-unknown elements to
    'Completely_Unknown'.

    Args:
        TE_Data (pandas.DataFrame): requires string columns 'Chromosome',
            'Order' and 'SuperFamily'.

    Returns:
        pandas.DataFrame: a filtered frame with renamed categories.
    """
    U = "Unknown_Order"
    master_order = {
        # Custom changes
        ## RepeatMasker-based Changes
        "unknown": U,
        "Unknown": U,
        "MITE": "DNA",
        "RC?": "DNA",
        "RC": "DNA",
        "SINE?": U,
        "tandem": "Tandem",
        "No_hits": U,
        ## EDTA-based Changes
        "pararetrovirus": "LTR",
        "mixture": "Mixture",
        "DNA": "TIR",
    }

    U = "Unknown_SuperFam"
    master_superfamily = {
        # EDTA/Wicker et al 2007 renames to common name:
        "RLC": "Copia",
        "RLG": "Gypsy",
        "RLB": "Bel_Pao",
        "RLR": "Retrovirus",
        "RLE": "ERV",
        "RYD": "DIRS",
        "RYN": "Ngaro",
        "RYV": "VIPER",
        "RPP": "Penelope",
        "RIR": "R2",
        "RIT": "RTE",
        "RIJ": "Jockey",
        "RIL": "L1",
        "RII": "I",
        "RST": "tRNA",
        "RSL": "7SL",
        "RSS": "5S",
        "DTT": "Tc1_Mariner",
        "DTA": "hAT",
        "DTM": "Mutator",
        "DTE": "Merlin",
        "DTR": "Transib",
        "DTP": "P",
        "DTB": "PiggyBac",
        "DTH": "PIF_Harbinger",
        "DTC": "CACTA",
        "DYC": "Crypton",
        "DHH": "Helitron",
        "DMM": "Maverick",
        # Custom changes ("Uknown" is a typo that occurs in the input data)
        "Uknown": U,
        "unknown": U,
        "Unknown": U,
        "EnSpm_CACTA": "CACTA",
        # NB a large block of commented-out legacy RepeatMasker mappings was
        # removed here; see version control history if they are needed again.
    }
    # BUGFIX: column assignment instead of chained `inplace=True`, which no
    # longer mutates the parent frame under pandas copy-on-write (the
    # default in pandas 3.0).
    TE_Data["SuperFamily"] = TE_Data["SuperFamily"].fillna("Unknown_SuperFam")
    # step to fix TE names
    TE_Data["Order"] = TE_Data["Order"].replace(master_order)
    TE_Data["SuperFamily"] = TE_Data["SuperFamily"].replace(master_superfamily)
    TE_Data.loc[TE_Data.Order == "Tandem", "SuperFamily"] = "Tandem"

    # Drop GFF bookkeeping rows; .copy() avoids SettingWithCopyWarning on
    # the .loc assignments below (behavior unchanged).
    to_drop = TE_Data.Chromosome.str.contains("##sequence-region")
    TE_Data = TE_Data[~to_drop].copy()
    # NOTE(review): "contig*" is a regex meaning 'conti' plus zero or more
    # 'g' characters, so this drops any chromosome containing 'conti'.
    # Probably meant the literal prefix 'contig' -- confirm before changing.
    to_drop = TE_Data.Chromosome.str.contains("contig*")
    TE_Data = TE_Data[~to_drop].copy()

    # Elements unknown at both levels become 'Completely_Unknown'.
    TE_Data.loc[
        (TE_Data["Order"] == "Unknown_Order")
        & (TE_Data["SuperFamily"] == "Unknown_SuperFam"),
        ["Order", "SuperFamily"],
    ] = "Completely_Unknown"

    # Make 'Helitron' both the Order and SuperFamily value.
    TE_Data.loc[
        (TE_Data["Order"] == "Helitron")
        & (TE_Data["SuperFamily"] == "Unknown_SuperFam"),
        ["SuperFamily"],
    ] = "Helitron"
    TE_Data.loc[(TE_Data["Order"] == "Helitron"), ["Order"]] = "Helitron"
    TE_Data.loc[(TE_Data["SuperFamily"] == "Helitron"), ["Order"]] = "Helitron"
    TE_Data.loc[(TE_Data["Order"] == "Mixture"), ["SuperFamily"]] = "Mixture"

    # LTR elements outside the recognized superfamilies are unknown LTRs.
    ltr_elements = ["Copia", "Gypsy"]
    TE_Data.loc[
        (TE_Data["Order"] == "LTR") & (~TE_Data["SuperFamily"].isin(ltr_elements)),
        ["SuperFamily"],
    ] = "Unknown_LTR_Superfam"
    # Drop Order categories that are not genuine TEs.
    TE_Data = TE_Data[TE_Data.Order != "Simple_repeat"]  # drop s repeat
    TE_Data = TE_Data[TE_Data.Order != "long_terminal_repeat"]  # drop
    TE_Data = TE_Data[TE_Data.Order != "Maverick"]  # drop if in Order category
    TE_Data = TE_Data[
        TE_Data.Order != "target_site_duplication"
    ]  # drop if in Order category
    return TE_Data
def check_nulls(my_df, logger):
    """Check the TE dataframe for ANY null values in ANY rows.

    Args:
        my_df (pandas.core.DataFrame): Pandas dataframe of TE values from TE
            annotation

        logger (logging.Logger): object to log information to
    """
    has_nulls = my_df.isnull().values.any()
    if has_nulls:
        logger.critical("You have null values in your dataframe!")
        logger.critical("Here are the null values in the output:")
        null_columns = my_df.columns[my_df.isnull().any()]
        # FIX: report through the logger instead of a bare print() so the
        # diagnostic lands in the same stream as the other messages
        # (consistent with import_rice_EDTA.py)
        logger.info(my_df[my_df.isnull().any(axis=1)][null_columns].head())


def write_cleaned_transposons(te_pandaframe, output_dir, genome_name, logger):
    """Write the cleaned TE annotation to disk as a headered TSV.

    Args:
        te_pandaframe (pandas.core.DataFrame): cleaned TE annotation
        output_dir (str): parent directory for the output file
        genome_name (str): genome identifier used in the output filename
        logger (logging.Logger): object to log information to
    """
    file_name = os.path.join(output_dir, ("Cleaned_" + genome_name + "_EDTA_TEs.tsv"))

    logger.info("Writing cleaned TE file to: %s" % file_name)
    te_pandaframe.to_csv(file_name, sep="\t", header=True, index=False)


def import_transposons(tes_input_path, te_annot_renamer, logger):
    """Import a blueberry EDTA TE annotation (GFF) as a cleaned dataframe.

    Args:
        tes_input_path (str): string of the file path to the TE annotation

        te_annot_renamer (callable): function mapping a raw TE dataframe to
            one with normalized Order/SuperFamily labels

        logger (logging obj): The object to call logs and info

    Returns:
        pandas.core.DataFrame: TE annotation with columns Chromosome, Start,
        Stop, Strand, Order, SuperFamily, Length; restricted to the first
        48 blueberry scaffolds and sorted by Chromosome then Start.
    """
    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Phase",
        "Attribute",
    ]

    TE_Data = pd.read_csv(
        tes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        comment="#",
        dtype={"Start": "float64", "Stop": "float64"},
    )

    # Drop extraneous columns
    TE_Data.drop(columns=["Score", "Software", "Phase", "Feature"], inplace=True)

    # Create Order and SuperFamily column from Attribute column
    # Because that column contains the detailed TE information
    # Then remove old Attribute column
    TE_Data["Attribute"] = TE_Data["Attribute"].str.extract(r"Classification=(.*?);")
    TE_Data[["Order", "SuperFamily"]] = TE_Data.Attribute.str.split("/", expand=True)
    TE_Data.drop(columns=["Attribute"], inplace=True)
    TE_Data.Order = TE_Data.Order.astype(str)
    TE_Data.SuperFamily = TE_Data.SuperFamily.astype(str)
    TE_Data.Strand = TE_Data.Strand.astype(str)

    # Rename because Blueberry scaffolds got renamed during EDTA
    TE_Data["Chromosome"] = "VaccDscaff" + TE_Data["Chromosome"].astype(str)

    # Call renamer
    TE_Data = te_annot_renamer(TE_Data)

    # Declare data types; Length is inclusive of both endpoints
    TE_Data["Length"] = TE_Data.Stop - TE_Data.Start + 1
    check_nulls(TE_Data, logger)

    TE_Data.sort_values(by=["Chromosome", "Start"], inplace=True)

    # MAGIC I only want the first 48 chromosomes
    chromosomes_i_want = ["VaccDscaff" + str(i) for i in range(49)]
    TE_Data = TE_Data.loc[TE_Data["Chromosome"].isin(chromosomes_i_want)]

    return TE_Data
def check_nulls(my_df, logger):
    """Check the TE dataframe for ANY null values in ANY rows.

    Args:
        my_df (pandas.core.DataFrame): Pandas dataframe of TE values from TE
            annotation

        logger (logging.Logger): object to log information to
    """
    found_nulls = my_df.isnull().values.any()
    if found_nulls:
        logger.critical("You have null values in your dataframe!")
        logger.critical("Here are the null values in the output:")
        null_columns = my_df.columns[my_df.isnull().any()]
        # FIX: report through the logger instead of a bare print() so the
        # diagnostic lands in the same stream as the other messages
        # (consistent with import_rice_EDTA.py)
        logger.info(my_df[my_df.isnull().any(axis=1)][null_columns].head())


def write_cleaned_transposons(te_pandaframe, output_dir, old_filename, logger):
    """Write the cleaned TE annotation to disk as a headered TSV.

    Args:
        te_pandaframe (pandas.core.DataFrame): cleaned TE annotation
        output_dir (str): parent directory for the output file
        old_filename (str): original annotation path; its basename is reused
            with a "Cleaned_" prefix and a .tsv extension
        logger (logging.Logger): object to log information to
    """
    file_name = os.path.join(
        output_dir,
        ("Cleaned_" + os.path.splitext(os.path.basename(old_filename))[0]) + ".tsv",
    )  # MAGIC to get proper extension

    logger.info("Writing cleaned TE file to: %s" % file_name)
    te_pandaframe.to_csv(file_name, sep="\t", header=True, index=False)


def import_transposons(tes_input_path, te_annot_renamer, logger):
    """Import an Arabidopsis EDTA TE annotation (GFF) as a cleaned dataframe.

    Args:
        tes_input_path (str): string of the file path to the TE annotation

        te_annot_renamer (callable): function mapping a raw TE dataframe to
            one with normalized Order/SuperFamily labels

        logger (logging obj): The object to call logs and info

    Returns:
        pandas.core.DataFrame: TE annotation with columns Chromosome, Start,
        Stop, Strand, Order, SuperFamily, Length, sorted by Chromosome then
        Start.
    """
    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Phase",
        "Attribute",
    ]

    te_data = pd.read_csv(
        tes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        comment="#",
        dtype={"Start": "float64", "Stop": "float64"},
    )

    # Drop extraneous columns
    te_data.drop(columns=["Score", "Software", "Phase", "Feature"], inplace=True)

    # Create Order and SuperFamily column from Attribute column
    # Because that column contains the detailed TE information
    # Then remove old Attribute column
    te_data["Attribute"] = te_data["Attribute"].str.extract(r"Classification=(.*?);")
    te_data[["Order", "SuperFamily"]] = te_data.Attribute.str.split("/", expand=True)
    te_data.drop(columns=["Attribute"], inplace=True)
    te_data.Order = te_data.Order.astype(str)
    te_data.SuperFamily = te_data.SuperFamily.astype(str)
    te_data.Strand = te_data.Strand.astype(str)

    # Call renamer
    te_data = te_annot_renamer(te_data)

    # Declare data types; Length is inclusive of both endpoints
    te_data["Length"] = te_data.Stop - te_data.Start + 1
    check_nulls(te_data, logger)

    te_data.sort_values(by=["Chromosome", "Start"], inplace=True)

    return te_data
"-o", 112 | type=str, 113 | default=output_default, 114 | help="Parent directory to output results", 115 | ) 116 | 117 | parser.add_argument( 118 | "-v", "--verbose", action="store_true", help="set debugging level to DEBUG" 119 | ) 120 | 121 | args = parser.parse_args() 122 | args.TE_input_file = os.path.abspath(args.TE_input_file) 123 | args.output_dir = os.path.abspath(args.output_dir) 124 | 125 | log_level = logging.DEBUG if args.verbose else logging.INFO 126 | logger = logging.getLogger(__name__) 127 | coloredlogs.install(level=log_level) 128 | 129 | # Execute 130 | cleaned_transposons = import_transposons( 131 | args.TE_input_file, te_annot_renamer, logger 132 | ) 133 | write_cleaned_transposons( 134 | cleaned_transposons, args.output_dir, args.TE_input_file, logger 135 | ) 136 | -------------------------------------------------------------------------------- /examples/Rice_Synteny/Makefile: -------------------------------------------------------------------------------- 1 | # scripts for running rice synteny TE Density examples 2 | # __file__ Makefile 3 | # __author__ Scott Teresi 4 | 5 | ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 6 | DEV_GFF_READ_EXECUTABLE := /home/scott/Documents/Uni/Research/gffread 7 | #DEV_GFF_READ_EXECUTABLE := /mnt/research/edgerpat_lab/Scotty/gffread # use if on HPCC 8 | DEV_DATA := $(realpath $(ROOT_DIR)/../../../TE_Density_Example_Data/Rice) 9 | DEV_GLABERRIMA_DATA := $(DEV_DATA)/Oryza_Glaberrima 10 | DEV_GLABERRIMA_FASTA_DATA := $(DEV_GLABERRIMA_DATA)/Sequences 11 | DEV_GLABERRIMA_GENES := $(DEV_GLABERRIMA_DATA)/Genes/Oryza_glaberrima.Oryza_glaberrima_V1.50.gff3 12 | DEV_GLABERRIMA_TEs := $(DEV_GLABERRIMA_DATA)/TEs/Oryza_Glaberrima_NewNames.fasta.mod.EDTA.TEanno.gff3 13 | DEV_SATIVA_DATA := $(DEV_DATA)/Oryza_Sativa 14 | DEV_SATIVA_FASTA_DATA := $(DEV_SATIVA_DATA)/Sequences 15 | DEV_SATIVA_GENES := $(DEV_SATIVA_DATA)/Genes/Oryza_sativa.IRGSP-1.0.50.gff3 16 | DEV_SATIVA_TEs := 
$(DEV_SATIVA_DATA)/TEs/Oryza_Sativa_NewNames.fasta.mod.EDTA.TEanno.gff3 17 | DEV_FILTERED := $(realpath $(ROOT_DIR)/../../../TE_Data/filtered_input_data) 18 | DEV_HDF5 := $(realpath $(ROOT_DIR)/../../../TE_Data/finalized_data/10KB) 19 | DEV_RESULTS:= $(realpath $(ROOT_DIR)/results) 20 | 21 | .PHONY: create_CDS fix_fasta_names fix_CDS_names 22 | 23 | create_CDS: 24 | @echo 25 | @echo Creating CDS from GFF and fasta file for glaberrima 26 | $(DEV_GFF_READ_EXECUTABLE)/gffread -x $(DEV_GLABERRIMA_FASTA_DATA)/Oryza_glaberrima_CDS.fasta -g $(DEV_GLABERRIMA_FASTA_DATA)/Oryza_glaberrima.Oryza_glaberrima_V1.dna.toplevel.fa $(DEV_GLABERRIMA_GENES) 27 | @echo Creating CDS from GFF and fasta file for sativa 28 | $(DEV_GFF_READ_EXECUTABLE)/gffread -x $(DEV_SATIVA_FASTA_DATA)/Oryza_sativa_CDS.fasta -g $(DEV_SATIVA_FASTA_DATA)/Oryza_sativa.IRGSP-1.0.dna.toplevel.fa $(DEV_SATIVA_GENES) 29 | @echo 30 | 31 | 32 | fix_fasta_names: 33 | @echo 34 | @echo Fixing the fasta names for glaberrima so that they are not too long for EDTA 35 | python $(ROOT_DIR)/src/fix_fasta_names.py $(DEV_GLABERRIMA_FASTA_DATA)/Oryza_glaberrima.Oryza_glaberrima_V1.dna.toplevel.fa Oryza_Glaberrima 36 | @echo Fixing the fasta names for sativa so that they are not too long for EDTA 37 | python $(ROOT_DIR)/src/fix_fasta_names.py $(DEV_SATIVA_FASTA_DATA)/Oryza_sativa.IRGSP-1.0.dna.toplevel.fa Oryza_Sativa 38 | @echo 39 | 40 | 41 | 42 | fix_CDS_names: 43 | @echo 44 | @echo Fixing the CDS fasta names for glaberrima so that they are not too long for EDTA 45 | python $(ROOT_DIR)/src/fix_cds_names.py $(DEV_GLABERRIMA_FASTA_DATA)/Oryza_glaberrima_CDS.fasta Oryza_Glaberrima 46 | @echo Fixing the CDS fasta names for sativa so that they are not too long for EDTA 47 | python $(ROOT_DIR)/src/fix_cds_names.py $(DEV_SATIVA_FASTA_DATA)/Oryza_sativa_CDS.fasta Oryza_Sativa 48 | @echo 49 | 50 | run_EDTA_HPCC: 51 | @echo Running EDTA for glaberrima 52 | sbatch $(ROOT_DIR)/src/Annotate_EDTA_Rice_Glaberrima.sb 53 | @echo Running EDTA 
filter_genes:
	@echo Filtering glaberrima genes into appropriate format for TE Density
	python $(ROOT_DIR)/src/import_rice_gene_anno.py $(DEV_GLABERRIMA_GENES)
	@echo Filtering sativa genes into appropriate format for TE Density
	python $(ROOT_DIR)/src/import_rice_gene_anno.py $(DEV_SATIVA_GENES)

# FIX: the echo previously said "blueberry"; this Makefile drives the rice data
filter_TEs:
	@echo Filtering rice TEs into appropriate format for TE Density
	python $(ROOT_DIR)/src/import_rice_EDTA.py $(DEV_GLABERRIMA_TEs)
	python $(ROOT_DIR)/src/import_rice_EDTA.py $(DEV_SATIVA_TEs)

run_TE_Density_HPCC:
	@echo Running TE Density for Glaberrima
	sbatch $(ROOT_DIR)/src/TE_Density_Glaberrima.sb
	sbatch $(ROOT_DIR)/src/TE_Density_Sativa.sb

filter_syntelogs:
	@echo Filtering syntelog file from SynMap to a cleaner version for downstream analysis
	mkdir -p $(ROOT_DIR)/results
	python $(ROOT_DIR)/src/generate_pairs.py $(DEV_DATA)/SynMap_Results/Glaberrima_VS_Sativa_SynMap.tsv -o $(ROOT_DIR)/results


compare_syntelog_TE_differences:
	@echo Generate graphs of syntelog TE differences
	@echo This is for chromosome 1
	mkdir -p $(ROOT_DIR)/results/graphs
	python $(ROOT_DIR)/src/compare_density.py $(DEV_RESULTS)/set_syntelogs.tsv $(DEV_HDF5)/Sativa/Sativa_1.h5 $(DEV_HDF5)/Glaberrima/Glaberrima_1.h5 $(DEV_FILTERED)/Cleaned_Oryza_sativa.IRGSP-1.0.50.tsv $(DEV_FILTERED)/Cleaned_Oryza_glaberrima.Oryza_glaberrima_V1.50.tsv -o $(ROOT_DIR)/results

# TODO Rename inputs for Rice
# NB target name keeps the historical typo ("interesing") so existing
# invocations and docs that reference it keep working
identify_interesing_genes:
	@echo Reporting genes with interesting levels of TE density
	mkdir -p $(DEV_RESULTS)/tables
	python $(ROOT_DIR)/src/find_abnormal_genes.py $(DEV_FILTERED)/Cleaned_Oryza_sativa.IRGSP-1.0.50.tsv $(DEV_HDF5)/Sativa -o $(DEV_RESULTS)/tables
def reformat_seq_iq(input_fasta, genome_name, output_dir, logger):
    """
    Reformat a regular FASTA file to have shorter sequence ID names for EDTA

    Args:
        input_fasta (str): String path to input fasta file

        genome_name (str): String for genome name

        output_dir (str): Path to output dir

        logger (logging.Logger): Object to log information to

    Returns:
        None, just saves the edited FASTA file to disk. Also writes a
        conversion table to disk for the old names and their new name
        counterparts
    """
    # MAGIC file suffixes
    new_fasta = os.path.join(output_dir, (genome_name + "_NewNames.fasta"))
    name_key = os.path.join(output_dir, (genome_name + "_Seq_ID_Conversion.txt"))

    if os.path.exists(new_fasta):
        os.remove(new_fasta)  # start from a clean slate
    if os.path.exists(name_key):
        os.remove(name_key)
    pair_dict = {}  # NB this is used to write the conversion key later for
    # clarity and note-taking

    # MAGIC we only want specific chromosomes
    chromosomes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, "Mt", "Pt"]
    chromosomes = [str(x) for x in chromosomes]  # redefine as string

    count = 0
    # FIX: open the destination ONCE instead of reopening it in append mode
    # for every single record (one open/close syscall pair per sequence);
    # the two original branches also duplicated identical write logic.
    with open(input_fasta, "r") as in_handle, open(new_fasta, "w") as out_handle:
        for s_record in SeqIO.parse(in_handle, "fasta"):
            if s_record.id not in chromosomes:
                # Non-chromosome scaffolds get a short auxiliary name so
                # their IDs are not too long for EDTA
                s_record.id = "Auxiliary_" + str(count)
                count += 1
            # NB key is the (possibly renamed) id; the value preserves the
            # full original header text for the conversion table
            pair_dict[s_record.id] = "\t " + s_record.description
            s_record.description = ""  # NB clear so the rewritten header
            # does not carry the extraneous info
            SeqIO.write(s_record, out_handle, "fasta")

    logger.info("Finished writing new fasta to: %s" % new_fasta)

    with open(name_key, "w") as output:
        for key, val in pair_dict.items():
            # Write the conversion table for record-keeping.
            output.write("%s\t%s\n" % (key, val))
    # NB name_key already contains output_dir; the original double
    # os.path.join was a no-op for absolute paths
    logger.info("Finished writing name conversion table to: %s" % name_key)
def write_cleaned_genes(gene_pandaframe, output_dir, old_filename, logger):
    """Write the cleaned gene annotation to disk as a headered TSV.

    Args:
        gene_pandaframe (pandas.core.DataFrame): cleaned gene annotation,
            indexed by gene name (index is written out)
        output_dir (str): parent directory for the output file
        old_filename (str): original annotation path; its basename is reused
            with a "Cleaned_" prefix and a .tsv extension
        logger (logging.Logger): object to log information to
    """
    file_name = os.path.join(
        output_dir,
        ("Cleaned_" + os.path.splitext(os.path.basename(old_filename))[0]) + ".tsv",
    )  # MAGIC to get proper extension

    logger.info("Writing cleaned gene file to: %s" % file_name)
    gene_pandaframe.to_csv(file_name, sep="\t", header=True, index=True)


def import_genes(genes_input_path, logger):
    """Import a gene annotation (GFF) and reduce it to gene rows.

    Args:
        genes_input_path (str): path to the gene annotation file

        logger (logging.Logger): object to log information to

    Returns:
        pandas.core.DataFrame: one row per gene, indexed by Gene_Name, with
        columns Chromosome, Feature, Start, Stop, Strand, Length; sorted by
        Chromosome then Start, null rows dropped.
    """

    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Frame",
        "FullName",
    ]

    col_to_use = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Strand",
        "FullName",
    ]

    gene_data = pd.read_csv(
        genes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        dtype={
            "Stop": "float64",
            "Start": "float64",
            "Chromosome": str,
            "Strand": str,
            # BUG FIX: key was misspelled "Fullname", so the str dtype was
            # silently never applied to the FullName column
            "FullName": str,
            "Feature": str,
            "Software": str,
        },
        comment="#",
    )

    # rows in annotation
    gene_data = gene_data[gene_data.Feature == "gene"]  # drop non-gene rows

    # Pull the gene name out of the GFF attribute string, then drop the
    # columns we no longer need
    gene_data["Gene_Name"] = gene_data["FullName"].str.extract(r"ID=(.*?);")
    gene_data = gene_data.drop(columns=["FullName", "Software"])
    gene_data["Length"] = gene_data.Stop - gene_data.Start + 1

    gene_data.sort_values(by=["Chromosome", "Start"], inplace=True)
    check_nulls(gene_data, logger)
    gene_data = drop_nulls(gene_data, logger)

    # Set the gene name as the index
    gene_data.set_index("Gene_Name", inplace=True)

    return gene_data


def get_nulls(my_df, logger):
    """
    Print out the row IDs where the null values exist

    Args:
        my_df (Pandaframes): Pandaframe to check null values in

        logger (logging.Logger): object to log information to
    """
    nas = my_df[my_df.isna().any(axis=1)]
    logger.warning("Rows where null exist: %s" % nas)


def drop_nulls(my_df, logger):
    """
    Drop null values inside a Pandaframe

    Args:
        my_df (Pandaframes): Pandaframe to drop null values

        logger (logging.Logger): object to log information to

    Returns:
        Pandaframes: the input with any row containing a null removed
    """
    nas = my_df[my_df.isna().any(axis=1)]
    if not nas.empty:
        logger.warning("Dropping rows with at least one Null value!")
    my_df = my_df.dropna(axis=0, how="any")
    return my_df
# System RAM in GiB, computed once at import time from POSIX sysconf
MAX_SYSTEM_RAM_GB = sysconf("SC_PAGE_SIZE") * sysconf("SC_PHYS_PAGES") / (1024.0 ** 3)
# Factory for a FileNotFoundError pre-filled with ENOENT; call with a path
FILE_DNE = partial(FileNotFoundError, errno.ENOENT, strerror(errno.ENOENT))


def set_numexpr_threads(n_threads=None):
    """Set number of threads for use in Numpy/Pandas NumExpr.

    NumExpr uses a default of the numexpr.detect_number_of_cores().
    This appears to be the number of hyperthreads.
    Calling this will prevent numexpr from making a log call at startup.

    Args:
        n_threads (int | None): thread count; falsy values select the
            NumExpr-detected core count
    """

    n_threads = n_threads or numexpr.detect_number_of_cores()
    numexpr.set_num_threads(n_threads)


def raise_if_no_file(filepath, logger=None, msg_fmt=None):
    """Raise FileNotFoundError if file does not exist.

    Args:
        filepath (str): path to check
        logger (logging.Logger | None): destination for the critical message
        msg_fmt (str | None): %-style format with one %s for the path
    """

    logger = logger or logging.getLogger(__name__)
    msg_fmt = msg_fmt or "not a file: %s"
    if not os.path.isfile(filepath):
        logger.critical(msg_fmt % filepath)
        raise FILE_DNE(filepath)


def raise_if_no_dir(filepath, logger=None, msg_fmt=None):
    """Raise FileNotFoundError if the path is not an existing directory.

    Args:
        filepath (str): path to check
        logger (logging.Logger | None): destination for the critical message
        msg_fmt (str | None): %-style format with one %s for the path
    """

    logger = logger or logging.getLogger(__name__)
    # BUG FIX: the caller-supplied msg_fmt was unconditionally overwritten;
    # honor it like raise_if_no_file does
    msg_fmt = msg_fmt or "not a directory: %s"
    if not os.path.isdir(filepath):
        logger.critical(msg_fmt % filepath)
        raise FILE_DNE(filepath)


def check_ram(ram_bytes, logger):
    """Raise ValueError if the requested RAM is negative or greater than the system.

    Args:
        ram_bytes (int): requested cache size in bytes
        logger (logging.Logger): destination for the critical message
    """

    if ram_bytes < 0:
        logger.critical("cache %i bytes < 0" % ram_bytes)
        raise ValueError()
    ram_gb = ram_bytes / (1024.0 ** 3)
    if ram_gb > MAX_SYSTEM_RAM_GB:
        # BUG FIX: the format string has two placeholders so it needs a
        # 2-tuple; the old code interpolated a single float, which raised
        # TypeError instead of the intended ValueError
        msg = "cache %i GB > system %i GB" % (ram_gb, MAX_SYSTEM_RAM_GB)
        logger.critical(msg)
        raise ValueError(msg)


def write_vlen_str_h5py(h5file, strings, dataset_key):
    """Write to an H5 File an iterable of variable length unicode.

    Args:
        h5file(h5py.File): opened destination file
        strings(iterable(str)): string list
        dataset_key(str): name of data set to write
    """

    vlen = h5py.special_dtype(vlen=str)
    # BUG FIX: materialize once; the old code counted with a generator pass
    # and then iterated again, so a generator input was exhausted before
    # the write and produced an empty dataset
    as_str = [str(s) for s in strings]
    dset = h5file.create_dataset(dataset_key, (len(as_str),), dtype=vlen)
    dset[:] = as_str


def read_vlen_str_h5py(h5file, dataset_key):
    """Read from an H5 File an iterable of variable length unicode.

    Args:
        h5file(h5py.File): opened source file
        dataset_key(str): name of data set to read

    Returns:
        list(str): dataset contents
    """

    return h5file[dataset_key][:].tolist()


def check_strand(my_df, logger):
    """
    In the gene annotation (GFF format), check to see if the user provided
    incoherent values and raise an error.
    Values should only be '-' or '+' or '.'. Will raise a logger info if '.'
    because we will assume that the gene is '+', because we HAVE to choose a
    direction for the gene to be orientated.

    Args:
        my_df (pandas.DataFrame): must contain a 'Strand' column
        logger (logging.Logger): destination for messages

    Raises:
        ValueError: if any Strand value is outside the whitelist
    """
    acceptable_values = ["-", "+", "."]
    # Check if you have any values in Strand that are not part of the whitelist.
    # FIX: use `not` rather than bitwise `~`; `~` on a plain Python bool is
    # integer negation (~True == -2, truthy) and would invert the test
    if not my_df["Strand"].isin(acceptable_values).all():
        unacceptable_values = my_df["Strand"].unique()
        unacceptable_values = [
            item for item in unacceptable_values if item not in acceptable_values
        ]
        logger.critical(
            """
            You have values in your gene annotation that do not
            conform to the GFF file format. Please fix this. Your unacceptable
            strand values are: %s
            """
            % unacceptable_values
        )
        raise ValueError

    # NOTE separate check for '.'
    if my_df["Strand"].isin(["."]).any():
        logger.warning(
            """
            You have rows in your gene annotation that have '.' as
            a Strand column value. Please note that we will treat
            these genes as sense orientation for the purposes of
            calculating TE Density.

            %s
            """
            % my_df.loc[my_df["Strand"] == "."]
        )
def check_nulls(my_df, logger):
    """Check the TE dataframe for ANY null values in ANY rows.

    Args:
        my_df (pandas.core.DataFrame): Pandas dataframe of TE values from TE
            annotation

        logger (logging.Logger): object to log information to
    """
    found_nulls = my_df.isnull().values.any()
    if found_nulls:
        logger.critical("You have null values in your dataframe!")
        logger.critical("Here are the null values in the output:")
        null_columns = my_df.columns[my_df.isnull().any()]
        logger.info(my_df[my_df.isnull().any(axis=1)][null_columns].head())


def write_cleaned_transposons(te_pandaframe, output_dir, old_filename, logger):
    """Write the cleaned TE annotation to disk as a headered TSV.

    Args:
        te_pandaframe (pandas.core.DataFrame): cleaned TE annotation
        output_dir (str): parent directory for the output file
        old_filename (str): original annotation path; its basename is reused
            with a "Cleaned_" prefix and a .tsv extension
        logger (logging.Logger): object to log information to
    """
    file_name = os.path.join(
        output_dir,
        ("Cleaned_" + os.path.splitext(os.path.basename(old_filename))[0]) + ".tsv",
    )  # MAGIC to get proper extension

    logger.info("Writing cleaned TE file to: %s" % file_name)
    te_pandaframe.to_csv(file_name, sep="\t", header=True, index=False)


def import_transposons(tes_input_path, te_annot_renamer, logger):
    """Import a rice EDTA TE annotation (GFF) as a cleaned dataframe.

    Args:
        tes_input_path (str): string of the file path to the TE annotation

        te_annot_renamer (callable): function mapping a raw TE dataframe to
            one with normalized Order/SuperFamily labels

        logger (logging obj): The object to call logs and info

    Returns:
        pandas.core.DataFrame: TE annotation with columns Chromosome, Start,
        Stop, Strand, Order, SuperFamily, Length; restricted to chromosomes
        "1" through "12" and sorted by Chromosome then Start.
    """
    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Phase",
        "Attribute",
    ]

    te_pandaframe = pd.read_csv(
        tes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        comment="#",
        dtype={"Start": "float64", "Stop": "float64"},
    )

    # Drop extraneous columns
    te_pandaframe.drop(columns=["Score", "Software", "Phase", "Feature"], inplace=True)

    # Create Order and SuperFamily column from Attribute column
    # Because that column contains the detailed TE information
    # Then remove old Attribute column
    te_pandaframe["Attribute"] = te_pandaframe["Attribute"].str.extract(
        r"Classification=(.*?);"
    )
    te_pandaframe[["Order", "SuperFamily"]] = te_pandaframe.Attribute.str.split(
        "/", expand=True
    )
    te_pandaframe.drop(columns=["Attribute"], inplace=True)
    te_pandaframe.Order = te_pandaframe.Order.astype(str)
    te_pandaframe.SuperFamily = te_pandaframe.SuperFamily.astype(str)
    te_pandaframe.Strand = te_pandaframe.Strand.astype(str)

    # Call renamer
    te_pandaframe = te_annot_renamer(te_pandaframe)

    # Declare data types; Length is inclusive of both endpoints
    te_pandaframe["Length"] = te_pandaframe.Stop - te_pandaframe.Start + 1
    check_nulls(te_pandaframe, logger)

    # MAGIC I only want the first 12 chromosomes
    chromosomes_i_want = [str(i) for i in range(1, 12 + 1)]  # MAGIC plus 1 bc range
    # NB, chromosomes_i_want must be string
    te_pandaframe = te_pandaframe.loc[
        te_pandaframe["Chromosome"].isin(chromosomes_i_want)
    ]
    # FIX: sort once, after filtering; the old code also sorted before the
    # filter, which was redundant work since row filtering preserves order
    te_pandaframe.sort_values(by=["Chromosome", "Start"], inplace=True)

    return te_pandaframe
RepeatMasker LTR/unknown 15404 16346 19.1 - 3262 Fvb1-1:9463282..9465926_LTR 5 | Fvb1-1 RepeatMasker LTR/unknown 16347 20224 3.8 - 30004 Fvb3-2:23517681..23521581_INT-int 6 | Fvb1-1 RepeatMasker LTR/unknown 20225 22848 15.4 - 12678 Fvb1-1:9463282..9465926_LTR 7 | Fvb1-1 RepeatMasker LTR/Gypsy 23450 24094 20.3 - 2636 Fvb1-1:5519794..5522056_LTR 8 | Fvb1-1 RepeatMasker LTR/unknown 24133 24415 19.0 - 1082 Fvb1-3:25198782..25200749_LTR 9 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 24412 24910 8.5 + 2257 family-72 10 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 25024 25752 18.8 + 1671 family-72 11 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 25751 25840 21.4 + 351 family-72 12 | Fvb1-1 RepeatMasker LTR/unknown 25842 27393 5.9 + 10968 Fvb6-4:17811941..17814568_LTR 13 | Fvb1-1 RepeatMasker LTR/Gypsy 27398 27999 19.6 + 1680 Fvb4-2:14377692..14383914_LTR 14 | Fvb1-1 RepeatMasker LTR/Gypsy 28097 28708 21.1 + 1651 Fvb4-2:14377692..14383914_LTR 15 | Fvb1-1 RepeatMasker LTR/unknown 28709 29048 4.4 - 2681 Fvb6-3:14035592..14036822_LTR 16 | Fvb1-1 RepeatMasker LTR/unknown 29051 30034 5.9 - 7242 Fvb7-3:21810512..21811992_LTR 17 | Fvb1-1 RepeatMasker LTR/unknown 30038 30447 20.0 + 1455 Fvb1-2:27872851..27875385_LTR 18 | Fvb1-1 RepeatMasker LTR/Gypsy 30440 31593 15.4 - 5517 Fvb1-1:1901529..1903797_LTR 19 | Fvb1-1 RepeatMasker LTR/Gypsy 30986 31928 12.8 + 3981 Fvb1-2:23736536..23738711_LTR 20 | Fvb1-1 RepeatMasker LTR/Gypsy 32011 33581 7.7 + 11550 Fvb6-4:12909946..12918059_INT-int 21 | Fvb1-1 RepeatMasker LTR/unknown 33582 33661 11.2 - 524 Fvb4-1:5166560..5169132_LTR 22 | Fvb1-1 RepeatMasker LTR/unknown 33646 33720 17.3 + 409 Fvb1-3:21882267..21885047_LTR 23 | Fvb1-1 RepeatMasker LTR/Gypsy 33721 34100 11.8 + 2503 Fvb4-2:19625958..19630035_INT-int 24 | Fvb1-1 RepeatMasker LTR/Gypsy 34101 36264 20.8 + 7112 Fvb3-1:4132469..4134698_LTR 25 | Fvb1-1 RepeatMasker LTR/Gypsy 36265 38219 20.9 + 6733 Fvb5-1:12085737..12087980_LTR 26 | Fvb1-1 RepeatMasker LTR/Gypsy 38514 40380 19.2 + 6488 Fvb6-2:9966870..9969479_LTR 27 | 
Fvb1-1 RepeatMasker LTR/unknown 40383 41342 5.3 - 7215 Fvb1-2:20827613..20830221_LTR 28 | Fvb1-1 RepeatMasker LTR/Gypsy 41343 41699 12.5 - 7196 Fvb1-1:1901529..1903797_LTR 29 | Fvb1-1 RepeatMasker LTR/Gypsy 41469 42958 8.3 + 9764 Fvb1-4:19462751..19465717_LTR 30 | Fvb1-1 RepeatMasker LTR/unknown 42959 45118 8.4 + 14420 Fvb6-4:10839685..10850190_INT-int 31 | Fvb1-1 RepeatMasker LTR/unknown 45111 45191 3.7 + 522 Fvb4-3:22936782..22939826_LTR 32 | Fvb1-1 RepeatMasker LTR/unknown 45192 46897 2.9 + 13508 Fvb1-2:20527699..20531500_INT-int 33 | Fvb1-1 RepeatMasker LTR/unknown 46898 46996 2.0 - 816 Fvb7-1:9481811..9484274_LTR 34 | Fvb1-1 RepeatMasker LTR/Gypsy 47028 47165 21.3 - 248 Fvb6-2:18470852..18482588_INT-int 35 | Fvb1-1 RepeatMasker LTR/unknown 47817 47844 21.9 + 503 Fvb5-3:23020132..23022075_LTR 36 | Fvb1-1 RepeatMasker LTR/unknown 47845 49893 24.3 - 2950 Fvb1-3:25198782..25200749_LTR 37 | Fvb1-1 RepeatMasker LTR/Gypsy 49946 50601 24.6 + 1906 Fvb1-1:5519794..5522056_LTR 38 | Fvb1-1 RepeatMasker LTR/unknown 50156 50664 29.4 + 935 Fvb1-2:20525092..20527698_LTR 39 | Fvb1-1 RepeatMasker LTR/unknown 50657 51121 12.8 + 2484 Fvb1-2:20525092..20527698_LTR 40 | Fvb1-1 RepeatMasker LTR/unknown 51147 51699 19.2 + 1656 Fvb4-3:22936782..22939826_LTR 41 | Fvb1-1 RepeatMasker LTR/unknown 51286 51946 17.5 - 2412 Fvb1-2:20827613..20830221_LTR 42 | Fvb1-1 RepeatMasker LTR/unknown 51947 52077 23.1 + 450 Fvb5-3:23020132..23022075_LTR 43 | Fvb1-1 RepeatMasker LTR/Gypsy 52286 54424 8.6 + 13432 Fvb4-4:12509905..12512100_LTR 44 | Fvb1-1 RepeatMasker LTR/Gypsy 55202 55513 14.0 - 1651 Fvb4-3:23640595..23654748_INT-int 45 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 55515 56340 4.8 + 3890 family-111 46 | Fvb1-1 RepeatMasker DNA/CMC-EnSpm 56341 59966 3.0 + 29386 family-2766 47 | Fvb1-1 RepeatMasker LTR/Copia 59974 60078 13.5 + 272 Fvb4-4:16016909..16022681_INT-int 48 | Fvb1-1 RepeatMasker LTR/Gypsy 59992 60163 4.7 - 1301 Fvb5-4:23958992..23969121_INT-int 49 | Fvb1-1 RepeatMasker LTR/unknown 60093 
def import_genes(genes_input_path, logger):
    """Import genes file.

    Reads a GFF-style gene annotation, keeps only 'gene' rows on chr7 and
    chr13, and returns a frame indexed by gene name.

    Args:
        genes_input_path (str): path to the gene annotation file

        logger (logging.Logger): logger used to report null rows

    Returns:
        gene_data (pandas.core.DataFrame): cleaned gene annotation indexed
            by Gene_Name with Chromosome, Feature, Start, Stop, Strand, and
            Length columns
    """

    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Frame",
        "FullName",
    ]

    col_to_use = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Strand",
        "FullName",
    ]

    gene_data = pd.read_csv(
        genes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        dtype={
            "Stop": "float64",
            "Start": "float64",
            "Chromosome": str,
            "Strand": str,
            "FullName": str,  # NB bugfix: was misspelled 'Fullname', which
            # read_csv silently ignores, so the dtype never applied
            "Feature": str,
            "Software": str,
        },
        comment="#",
    )

    # rows in annotation
    gene_data = gene_data[gene_data.Feature == "gene"]  # drop non-gene rows
    gene_data["Gene_Name"] = gene_data["FullName"].str.extract(r";gene_name=(.*?);")

    # NOTE
    gene_data.drop_duplicates(subset=["Gene_Name"], keep=False, inplace=True)
    # Drop duplicate gene names for the human data set, will add explicit
    # function to fix duplicate gene names in future so that the code doesn't
    # crash (Crashes TE_Density due to shape error). TODO add error handling
    # and elegant renaming function to fix duplicate gene names, not relevant
    # to this example though

    gene_data = gene_data.drop(columns=["FullName", "Software"])

    gene_data.Strand = gene_data.Strand.astype(str)

    gene_data["Length"] = gene_data.Stop - gene_data.Start + 1

    # MAGIC I only want the 7th and 13th chromosome
    chromosomes_i_want = ["chr7", "chr13"]
    gene_data = gene_data.loc[gene_data["Chromosome"].isin(chromosomes_i_want)]

    gene_data.sort_values(by=["Chromosome", "Start"], inplace=True)
    check_nulls(gene_data, logger)
    gene_data = drop_nulls(gene_data, logger)
    gene_data.set_index("Gene_Name", inplace=True)

    return gene_data
#!/usr/bin/env/python
# NOTE(review): shebang is malformed ('env/python' should be 'env python');
# harmless when run as 'python find_abnormal_genes.py' -- confirm intent

"""
Identify genes with abnormally high or low upstream LTR density.

Selects O. sativa genes at or above the 99th percentile of LTR density in
the 1KB upstream window, plus an equally sized random sample of genes
passing the 1st-percentile cutoff, and writes both gene lists to disk.
"""

__author__ = "Scott Teresi"

import argparse
import os
import logging
import coloredlogs
import numpy as np
import pandas as pd

from transposon.gene_data import GeneData
from transposon.density_data import DensityData
from transposon.import_filtered_genes import import_filtered_genes


if __name__ == "__main__":
    path_main = os.path.abspath(__file__)
    dir_main = os.path.dirname(path_main)
    output_default = os.path.abspath(os.path.join(dir_main, "../", "results/graphs"))
    parser = argparse.ArgumentParser(description="generate graphs")

    # NOTE(review): help string says 'Arabidopsis' but this script reads
    # O. sativa data everywhere else -- looks like a copy-paste slip; confirm
    parser.add_argument(
        "o_sativa_gene_data",
        type=str,
        help="parent path to Arabidopsis' filtered gene data file",
    )

    parser.add_argument(
        "sativa_density_data_dir",
        type=str,
        help="Parent path of folders containing TE Density results",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="set debugging level to DEBUG"
    )
    parser.add_argument(
        "--output_dir",
        "-o",
        type=str,
        default=output_default,
        help="parent directory to output results",
    )
    args = parser.parse_args()
    args.o_sativa_gene_data = os.path.abspath(args.o_sativa_gene_data)
    args.sativa_density_data_dir = os.path.abspath(args.sativa_density_data_dir)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = logging.getLogger(__name__)
    coloredlogs.install(level=log_level)

    # Begin reading files:
    # Get the genes:
    cleaned_genes = import_filtered_genes(args.o_sativa_gene_data, logger)
    # Split the annotation into one sub-frame per chromosome
    gene_dataframe_list = [
        dataframe for k, dataframe in cleaned_genes.groupby("Chromosome")
    ]
    # NB MAGIC get unique chromosome ID
    gene_data_list = [
        GeneData(dataframe, dataframe["Chromosome"].unique()[0])
        for dataframe in gene_dataframe_list
    ]
    # MAGIC 'Sativa_(.*?).h5' is the regex naming this genome's HDF5 files
    processed_dd_data = DensityData.from_list_gene_data_and_hdf5_dir(
        gene_data_list, args.sativa_density_data_dir, "Sativa_(.*?).h5", logger
    )

    # NOTE
    cleaned_genes.reset_index(inplace=True)  # necessary
    # cleaned_genes = cleaned_genes.loc[cleaned_genes["Chromosome"] == "1"]
    # Pair each chromosome's genes with its DensityData instance and record
    # the HDF5 row index of every gene
    to_concat = []
    for chrom, dataframe in cleaned_genes.groupby(["Chromosome"]):
        for processed_dd_datum in processed_dd_data:
            if processed_dd_datum.unique_chromosome_id == chrom:
                x = processed_dd_datum.add_hdf5_indices_to_gene_data(dataframe)
                to_concat.append(x)
    gene_frame_with_indices = pd.concat(to_concat)

    # NOTE
    # Add the LTR 1KB-upstream density values as a column per chromosome
    to_concat = []
    for chrom, dataframe in gene_frame_with_indices.groupby(["Chromosome"]):
        for processed_dd_datum in processed_dd_data:
            if processed_dd_datum.unique_chromosome_id == chrom:
                x = processed_dd_datum.add_te_vals_to_gene_info_pandas(
                    dataframe, "Order", "LTR", "Upstream", 1000
                )
                to_concat.append(x)
    gene_frame_w_ind_te_vals = pd.concat(to_concat)

    # MAGIC the 99th / 1st percentiles of upstream LTR density define 'abnormal'
    upper_cutoff_val = np.percentile(gene_frame_w_ind_te_vals["LTR_1000_Upstream"], 99)
    lower_cutoff_val = np.percentile(gene_frame_w_ind_te_vals["LTR_1000_Upstream"], 1)

    genes_meeting_upper_cutoff = gene_frame_w_ind_te_vals.loc[
        gene_frame_w_ind_te_vals["LTR_1000_Upstream"] >= upper_cutoff_val
    ]["Gene_Name"].to_list()

    upper_cutoff_len = len(genes_meeting_upper_cutoff)

    print(f"Upper Cutoff Length: {upper_cutoff_len}")
    print(f"Upper Cutoff Val: {upper_cutoff_val}")

    print(lower_cutoff_val)
    # NOTE(review): '>=' selects genes ABOVE the 1st percentile, i.e. nearly
    # all genes, not a low-density tail; the comment below suggests this is
    # deliberate, but confirm '<=' was not intended
    genes_meeting_lower_cutoff = gene_frame_w_ind_te_vals.loc[
        gene_frame_w_ind_te_vals["LTR_1000_Upstream"] >= lower_cutoff_val
    ]["Gene_Name"].to_list()
    print(len(genes_meeting_lower_cutoff))  # NOTE this is 37741 genes when you
    # faithfully apply the cutoff

    # Take random sample equal to the length of the gene array of upper values.
    # Have to take a random sample because data not normally distributed and
    # the cutoff value for the 1st percentile is so low you would actually get
    # way too many genes if you actually applied it.
    genes_meeting_lower_cutoff = np.random.choice(
        gene_frame_w_ind_te_vals.loc[
            gene_frame_w_ind_te_vals["LTR_1000_Upstream"] >= lower_cutoff_val
        ]["Gene_Name"].to_list(),
        upper_cutoff_len,
        replace=False,
    )

    # NOTE begin writing all the values
    filename_to_write = os.path.join(
        args.output_dir,
        str("Upper_Sample_LTR_1000_Upstream" + ".tsv"),
    )
    with open(
        filename_to_write,
        "w",
    ) as f_out:

        for gene in genes_meeting_upper_cutoff:
            f_out.write(gene + "\n")
    logger.info("Writing to: %s" % filename_to_write)

    filename_to_write = os.path.join(
        args.output_dir,
        str("Lower_Sample_LTR_1000_Upstream" + ".tsv"),
    )
    with open(
        filename_to_write,
        "w",
    ) as f_out:

        for gene in genes_meeting_lower_cutoff:
            f_out.write(gene + "\n")
    logger.info("Writing to: %s" % filename_to_write)
def te_annot_renamer(transposon_data):
    """Reclassify and filter the raw human TE annotation.

    Reclassifies entries toward Wicker et al. 2007 groupings (Penelope LINEs
    become their own PLE order, the DNA order is renamed TIR) and drops RNA
    genes, ERV superfamilies, low-confidence ('?') calls, and other non-TE
    repeat classes.

    Args:
        transposon_data (pandas.core.DataFrame): TE annotation with 'Order'
            and 'SuperFamily' columns

    Returns:
        transposon_data (pandas.core.DataFrame): the filtered and renamed
            annotation
    """
    # NB report shape and categories before filtering so the user can compare
    # against the summary printed at the end
    print()
    print(transposon_data.shape)
    print(transposon_data["Order"].unique())
    print(transposon_data["SuperFamily"].unique())
    print()

    # Make Penelope its own Order called PLE, make the SuperFamily Penelope
    # Corresponds to Wicker's grouping of Penelope elements
    transposon_data.loc[
        (transposon_data["Order"] == "LINE")
        & (transposon_data["SuperFamily"] == "Penelope"),
        ["Order"],
    ] = "PLE"

    # Drop RNA-gene Orders, low-confidence ('?') Orders, and non-TE repeat
    # classes; none of these are in Wicker's classification.
    # NB bugfix: the original filtered 'rRNA' twice; the chained boolean
    # filters are consolidated into one isin() drop list
    orders_to_drop = [
        "snRNA",
        "tRNA",
        "rRNA",
        "srpRNA",
        "scRNA",
        "LTR?",
        "SINE?",
        "RC?",
        "DNA?",
        "RNA",
        "Low_complexity",
        "RC",
        "Satellite",
        "Simple_repeat",
    ]
    transposon_data = transposon_data[~transposon_data.Order.isin(orders_to_drop)]

    # Remove ERV superfamilies, low-confidence ('?') superfamilies, and
    # elements whose superfamily is just 'DNA'
    superfamilies_to_drop = [
        "ERVL-maLR",
        "ERV1",
        "ERVL",
        "ERVK",
        "ERV1?",
        "ERVL?",
        "PiggyBac?",
        "Gypsy?",
        "TcMar?",
        "hAT?",
        "hAT-Tip100?",
        "DNA",
    ]
    transposon_data = transposon_data[
        ~transposon_data.SuperFamily.isin(superfamilies_to_drop)
    ]

    # Rename DNA order to TIR
    transposon_data.loc[(transposon_data["Order"] == "DNA"), ["Order"]] = "TIR"

    # NB report shape and categories after filtering
    print()
    print(transposon_data.shape)
    print(transposon_data["Order"].unique())
    print(transposon_data["SuperFamily"].unique())
    print()

    return transposon_data
def get_gene_data_as_list(cleaned_genes):
    """
    Split a cleaned gene annotation (as produced by import_filtered_genes)
    into one GeneData object per chromosome.

    Args:
        cleaned_genes (pandas.DataFrame)
            Index:
                Name: Gene_Name, strings of gene names
            Columns:
                Name: Chromosome, object
                Name: Feature, object
                Name: Start, float64
                Name: Stop, float64
                Name: Strand, object
                Name: Length, float64

    Returns:
        list of GeneData: one entry per unique chromosome ID; used to
        initialize the DensityData objects
    """
    per_chromosome = []
    # MAGIC group rows by the Chromosome column; each sub-frame holds the
    # genes of exactly one chromosome, whose unique ID names the GeneData
    for _, chromosome_frame in cleaned_genes.groupby("Chromosome"):
        unique_id = chromosome_frame["Chromosome"].unique()[0]
        per_chromosome.append(GeneData(chromosome_frame, unique_id))
    return per_chromosome
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="start to analyze TE Density data")

    parser.add_argument(
        "cleaned_gene_annotation",
        type=str,
        help="path to your cleaned gene annotation (.tsv)",
    )

    parser.add_argument(
        "density_data_dir",
        type=str,
        help="Parent path of folder containing ONLY the TE Density results",
    )

    parser.add_argument(
        "chromosome_string",
        type=str,
        help="regex for chromosome ID, e.g. 'GenomeName_(.*?).h5'",
    )
    # NOTE users will need to edit this as needed!
    # This is the regular expression that is used to extract the chromosome IDs
    # and initialize the DensityData objects. This is specific to the naming of
    # your chromosomes in the HDF5 files.
    # Please consult the docstring of from_list_gene_data_and_hdf5_dir in
    # density_data.py for more information, typically you will only need to
    # edit the part of the string before the underscore. Here this was specific
    # For example I was working on a genome with the name DN, so my regex rule
    # was "DN_(.*?).h5"

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="set debugging level to DEBUG"
    )
    args = parser.parse_args()
    args.cleaned_gene_annotation = os.path.abspath(args.cleaned_gene_annotation)
    args.density_data_dir = os.path.abspath(args.density_data_dir)

    # NB just for logging arguments to import_filtered command and DensityData
    # initialization
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = logging.getLogger(__name__)
    coloredlogs.install(level=log_level)

    # --------------------------------------------------
    # Read cleaned genes for the given genome as pandas
    cleaned_genes = import_filtered_genes(args.cleaned_gene_annotation, logger)

    # Get list of GeneData for each genome to enable initialization of
    # DensityData
    genedata_list = get_gene_data_as_list(cleaned_genes)

    # Initialize DensityData for each genome
    # NOTE this object is a list of DensityData instances
    processed_dd_data = DensityData.from_list_gene_data_and_hdf5_dir(
        genedata_list, args.density_data_dir, args.chromosome_string, logger
    )

    # NOTE, this adds the indices of the genes in the HDF5 datasets to a pandas
    # dataframe, this is used later on to access the density data for each gene
    # NB bugfix: this call was previously made twice with identical arguments;
    # the first result ('gene_frame_with_indices') was never used, so the
    # duplicate call has been removed
    gene_frame_with_hdf5_indices = add_hdf5_indices_to_gene_data_from_list_hdf5(
        cleaned_genes, processed_dd_data
    )

    # NOTE, now we can add columns to the pandas dataframe which are the
    # density values for a specific TE type and window combo. This right here
    # adds a column called "LTR_500_Upstream" to the pandas dataframe, this
    # column represents the TE density values for each gene, for the 500 bp
    # upstream window. 500 bp window is used here because I use a smaller
    # testing set of windows for development.

    # NOTE users can loop this over a list of TE types and window sizes if they
    # want
    gene_frame_with_te_vals_of_interest = (
        add_te_vals_to_gene_info_pandas_from_list_hdf5(
            gene_frame_with_hdf5_indices,
            processed_dd_data,
            "Order",
            "LTR",
            "Upstream",
            500,
        )
    )
    print(gene_frame_with_te_vals_of_interest)
def import_genes(genes_input_path, logger):
    """Import genes file.

    Args:
        genes_input_path (str): path to the gene annotation file

        logger (logging.Logger): logger used to report null rows

    Returns:
        gene_pandaframe (Pandas.Data.Frame): A pandas dataframe of a filtered
        GFF file containing the information needed for the TE Density pipeline.
    """

    col_names = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Score",
        "Strand",
        "Frame",
        "FullName",
    ]

    col_to_use = [
        "Chromosome",
        "Software",
        "Feature",
        "Start",
        "Stop",
        "Strand",
        "FullName",
    ]

    gene_pandaframe = pd.read_csv(
        genes_input_path,
        sep="\t+",
        header=None,
        engine="python",
        names=col_names,
        usecols=col_to_use,
        dtype={
            "Stop": "float64",
            "Start": "float64",
            "Chromosome": str,
            "Strand": str,
            "FullName": str,  # NB bugfix: was misspelled 'Fullname', which
            # read_csv silently ignores, so the dtype never applied
            "Feature": str,
            "Software": str,
        },
        comment="#",
    )

    # rows in annotation
    gene_pandaframe = gene_pandaframe[
        gene_pandaframe.Feature == "gene"
    ]  # drop non-gene rows

    gene_pandaframe["Gene_Name"] = gene_pandaframe["FullName"].str.extract(
        r"gene_id=(.*?);"
    )
    gene_pandaframe = gene_pandaframe.drop(columns=["FullName", "Software"])
    gene_pandaframe["Length"] = gene_pandaframe.Stop - gene_pandaframe.Start + 1

    # MAGIC I only want the first 12 chromosomes
    chromosomes_i_want = [str(i) for i in range(1, 12 + 1)]  # MAGIC plus 1 bc range
    # NB, chromosomes_i_want must be string
    gene_pandaframe = gene_pandaframe.loc[
        gene_pandaframe["Chromosome"].isin(chromosomes_i_want)
    ]
    # NB sort once, after filtering; boolean .loc filtering preserves row
    # order, so the pre-filter sort in the original was redundant
    gene_pandaframe.sort_values(by=["Chromosome", "Start"], inplace=True)

    check_nulls(gene_pandaframe, logger)
    gene_pandaframe = drop_nulls(gene_pandaframe, logger)
    # NB
    # Throwing away 1 gene because its name is ridiculous and difficult to
    # import because of extraneous strings in the entry
    # There are '#' characters in the last column of the GFF for a minority of
    # genes in both genomes. Will not be modifying the 'comment=#' of the read
    # command because that will remove the easy ability to remove the comment
    # rows from the annotation. A fix could be performed by removing the #
    # characters through another method, but I will just remove the offending
    # genes because they aren't central to the example.

    # Set the gene name as the index
    gene_pandaframe.set_index("Gene_Name", inplace=True)
    return gene_pandaframe
os.path.abspath(args.gene_input_file) 169 | args.output_dir = os.path.abspath(args.output_dir) 170 | 171 | log_level = logging.DEBUG if args.verbose else logging.INFO 172 | logger = logging.getLogger(__name__) 173 | coloredlogs.install(level=log_level) 174 | 175 | # Execute 176 | cleaned_genes = import_genes(args.gene_input_file, logger) 177 | write_cleaned_genes(cleaned_genes, args.output_dir, args.gene_input_file, logger) 178 | -------------------------------------------------------------------------------- /tests/unit/test_OverlapData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Unit test OverlapData. 5 | 6 | OverlapData can be used in two modes: outut or input file, SEE `from_param`, `from_file`. 7 | Tests are grouped accordingly. 8 | """ 9 | 10 | __author__ = "Michael Teresi" 11 | 12 | import logging 13 | import os 14 | import pytest 15 | import tempfile 16 | 17 | import coloredlogs 18 | import numpy as np 19 | import pandas as pd 20 | 21 | from transposon.gene_data import GeneData 22 | from transposon.overlap import OverlapData 23 | 24 | N_TRANSPOSONS = 4 25 | WINDOWS = [10, 20] 26 | LOGGER = logging.getLogger(__name__) 27 | coloredlogs.install(level=logging.DEBUG) 28 | 29 | 30 | @pytest.fixture 31 | def gene_data(): 32 | """Default GeneData instance.""" 33 | 34 | return GeneData.mock() 35 | 36 | @pytest.fixture 37 | def temp_dir(): 38 | """Temporary directory.""" 39 | 40 | with tempfile.TemporaryDirectory() as temp_dir: 41 | yield temp_dir 42 | 43 | 44 | @pytest.fixture 45 | def temp_file(): 46 | """Temporary directory.""" 47 | 48 | with tempfile.NamedTemporaryFile(suffix="."+OverlapData.EXT) as temp_file: 49 | yield temp_file.name 50 | 51 | @pytest.fixture 52 | def default_data_out(temp_file): 53 | """Return default output OverlapData instance.""" 54 | 55 | return OverlapData.from_param( 56 | GeneData.mock(), N_TRANSPOSONS, WINDOWS, temp_file, logger=LOGGER 57 | ) 58 | 59 | 
@pytest.fixture 60 | def active_output(default_data_out): 61 | """Default OverlapData instance for writing.""" 62 | 63 | with default_data_out as active_output: 64 | yield active_output 65 | 66 | @pytest.fixture 67 | def active_input(default_data_out): 68 | """Default OverlapData instance for reading.""" 69 | 70 | filepath = None 71 | with default_data_out as io: 72 | filepath = io.filepath 73 | with OverlapData.from_file(filepath) as io: 74 | yield io 75 | 76 | @pytest.fixture 77 | def serialized_deserialized(default_data_out): 78 | """Yield an overlap written to disk and one read from the first.""" 79 | 80 | filepath = None 81 | with default_data_out as output: 82 | # MAGIC NUMBER dummy data 83 | output.left[:] = np.ones(output.left.shape) * 2 84 | output.right[:] = np.ones(output.right.shape) * 3 85 | output.intra[:] = np.ones(output.intra.shape) * 4 86 | output._h5_file.flush() 87 | filepath = output.filepath 88 | with OverlapData.from_file(filepath) as input: 89 | yield (input, output) 90 | 91 | def test_from_param_raise(gene_data, temp_file): 92 | """Does the from param factory raise?""" 93 | 94 | OverlapData.from_param(gene_data, N_TRANSPOSONS, WINDOWS, temp_file, logger=LOGGER) 95 | 96 | def test_from_param_raise_enter_exit(active_output): 97 | """Does the context manager raise for an output file?""" 98 | 99 | pass 100 | 101 | def test_open_dispatch_bad(temp_dir): 102 | """Does the open dispatch raise on an invalid config?""" 103 | 104 | class DummyClass(): 105 | pass 106 | od = OverlapData(DummyClass()) 107 | with pytest.raises(TypeError) as excinfo: 108 | od._open_dispatcher() 109 | 110 | def test_open_dispatch_sink(active_output): 111 | """Does the open dispatch create a file?""" 112 | 113 | assert os.path.isfile(active_output._h5_file.filename) 114 | 115 | def _test_open_dispatch_source(temp_dir): 116 | """Does the open dispatch call the source initializer?""" 117 | 118 | raise NotImplementedError() 119 | 120 | def 
test_from_file_raise_valid(default_data_out): 121 | """Does the from file factory raise for valid data?""" 122 | 123 | filepath = None 124 | with default_data_out as io: 125 | filepath = io.filepath 126 | overlap_data = OverlapData.from_file(filepath, LOGGER) 127 | 128 | def test_from_file_raise(temp_dir): 129 | """Does the from file factory raise for invalid data?""" 130 | 131 | with pytest.raises(ValueError) as excinfo: 132 | overlap_data = OverlapData.from_file('not a file') 133 | 134 | def test_from_file_raise_enter_exit(default_data_out): 135 | """Does the context manager raise?""" 136 | 137 | filepath = None 138 | with default_data_out as io: 139 | filepath = io.filepath 140 | with OverlapData.from_file(filepath) as io: 141 | assert io.filepath == filepath 142 | 143 | def test_left_right_shape(active_output): 144 | """Do the shapes match for left / right overlap?""" 145 | 146 | assert np.all(active_output.left.shape == active_output.right.shape) 147 | 148 | def test_io_windows(serialized_deserialized): 149 | """Can it serialize / deserialize the windows?""" 150 | 151 | input, output = serialized_deserialized 152 | assert input.windows == output.windows 153 | 154 | def test_io_genes(serialized_deserialized): 155 | """Can it serialize / deserialize the gene names?""" 156 | 157 | input, output = serialized_deserialized 158 | assert input.gene_names == output.gene_names 159 | 160 | def test_io_chromosome_id(serialized_deserialized): 161 | """Can it serialize / deserialize the chromosome id?""" 162 | 163 | input, output = serialized_deserialized 164 | assert input.chromosome_id == output.chromosome_id 165 | 166 | def test_io_chromosome_id(serialized_deserialized): 167 | """Can it serialize / deserialize the chromosome id?""" 168 | 169 | input, output = serialized_deserialized 170 | assert input.genome_id == output.genome_id 171 | 172 | def test_io_left(serialized_deserialized): 173 | """Can it serialize / deserialize the left overlap""" 174 | 175 | input, output = 
serialized_deserialized 176 | assert np.all(input.left == output.left) 177 | 178 | def test_io_intra(serialized_deserialized): 179 | """Can it serialize / deserialize the intra overlap?""" 180 | 181 | input, output = serialized_deserialized 182 | assert np.all(input.intra == output.intra) 183 | 184 | def test_io_right(serialized_deserialized): 185 | """Can it serialize / deserialize the right overlap?""" 186 | 187 | input, output = serialized_deserialized 188 | assert np.all(input.right == output.right) 189 | 190 | # TODO test slicing 191 | 192 | if __name__ == "__main__": 193 | pytest.main(['-s', __file__]) # for convenience 194 | -------------------------------------------------------------------------------- /tests/unit/test_DensityData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | 3 | """ 4 | Unit test DensityData 5 | """ 6 | 7 | __author__ = "Scott Teresi" 8 | 9 | import h5py 10 | import numpy as np 11 | import pytest 12 | import logging 13 | import coloredlogs 14 | 15 | from transposon import write_vlen_str_h5py 16 | from transposon.density_data import DensityData 17 | from transposon.import_filtered_genes import import_filtered_genes 18 | from transposon.gene_data import GeneData 19 | 20 | LOGGER = logging.getLogger(__name__) 21 | coloredlogs.install(level=logging.DEBUG) 22 | TEST_GENE_FILE = "tests/input_data/Test_Genes_DensityData.tsv" 23 | 24 | 25 | @pytest.fixture 26 | def chromosomes(): 27 | """Return a list of chromosomes for h5py file""" 28 | return ["TestChrom1"] 29 | 30 | 31 | @pytest.fixture 32 | def gene_names(): 33 | """Yield a list of genes""" 34 | gene_names = [ 35 | "dummy1", 36 | "dummy2", 37 | "dummy3", 38 | "dummy4", 39 | "dummy5", 40 | "dummy6", 41 | "dummy7", 42 | "dummy8", 43 | ] 44 | return gene_names 45 | 46 | 47 | @pytest.fixture 48 | def dummy_gene_data(): 49 | """Yield a GeneData for testing""" 50 | gene_pandas = import_filtered_genes(TEST_GENE_FILE, LOGGER) 51 | 
return GeneData(gene_pandas, "dummy_genome_name", LOGGER) 52 | 53 | 54 | @pytest.fixture 55 | def order_names(): 56 | """Return a list of order types""" 57 | order_names = ["LTR", "TIR"] 58 | return order_names 59 | 60 | 61 | @pytest.fixture 62 | def super_names(): 63 | """Return a list of super names""" 64 | super_names = ["Copia", "Gypsy", "HAT"] 65 | return super_names 66 | 67 | 68 | @pytest.fixture 69 | def windows(): 70 | """Return a list of window values""" 71 | windows = ["500", "1000", "1500"] 72 | return windows 73 | 74 | 75 | @pytest.fixture 76 | def total_orders(order_names): 77 | """Return an integer which is the total number of orders""" 78 | return sum(1 for order in order_names) 79 | 80 | 81 | @pytest.fixture 82 | def total_windows(windows): 83 | """Return an integer which is the total number of windows""" 84 | return sum(1 for window in windows) 85 | 86 | 87 | @pytest.fixture 88 | def total_genes(gene_names): 89 | """Return an integer which is the total number of genes""" 90 | return sum(1 for gene in gene_names) 91 | 92 | 93 | @pytest.fixture 94 | def rho_o_left(total_orders, total_windows, total_genes): 95 | """Return an array of order left values""" 96 | # NOTE currently it will perform np.arange(48) and reshape to (2,3,8) 97 | matrix_num = total_orders * total_windows * total_genes 98 | arr = np.arange(matrix_num).reshape((total_orders, total_windows, total_genes)) 99 | return arr 100 | 101 | 102 | @pytest.fixture 103 | def rho_o_intra(total_orders, total_genes): 104 | """Return an array of order left values""" 105 | # NOTE currently it will perform np.arange(48) 106 | matrix_num = total_orders * total_genes 107 | arr = np.arange(matrix_num).reshape((total_orders, 1, total_genes)) 108 | return arr 109 | 110 | 111 | @pytest.fixture 112 | def rho_o_right(total_orders, total_windows, total_genes): 113 | """Return an array of order left values""" 114 | # NOTE currently it will perform np.arange(48) *2 starting from 48 115 | # so that we may 
differentiate from rho o left, and reshape to (2,3,8) 116 | # All this does is make the values different, not the shape 117 | matrix_num = total_orders * total_windows * total_genes * 2 118 | arr = np.arange(48, matrix_num).reshape((total_orders, total_windows, total_genes)) 119 | return arr 120 | 121 | 122 | @pytest.fixture 123 | def density_data_test_obj_swap_vals( 124 | chromosomes, 125 | dummy_gene_data, 126 | order_names, 127 | super_names, 128 | windows, 129 | rho_o_left, 130 | rho_o_intra, 131 | rho_o_right, 132 | ): 133 | """Create a test object for DensityData, reads from file""" 134 | # TODO set path to be a variable not hard-coded, plus it repeats further 135 | # down 136 | f = h5py.File( 137 | "tests/input_data/test_swap_file.h5", 138 | "w", 139 | ) 140 | gene_names = list(dummy_gene_data.names) 141 | write_vlen_str_h5py(f, chromosomes, "CHROMOSOME_ID") 142 | write_vlen_str_h5py(f, gene_names, "GENE_NAMES") 143 | write_vlen_str_h5py(f, order_names, "ORDER_NAMES") 144 | write_vlen_str_h5py(f, super_names, "SUPERFAMILY_NAMES") 145 | write_vlen_str_h5py(f, windows, "WINDOWS") 146 | 147 | dset = f.create_dataset("RHO_ORDERS_LEFT", data=rho_o_left) 148 | dset = f.create_dataset("RHO_ORDERS_INTRA", data=rho_o_intra) 149 | dset = f.create_dataset("RHO_ORDERS_RIGHT", data=rho_o_right) 150 | 151 | # NB just re-doing the values for the supers because not testing supers 152 | dset = f.create_dataset("RHO_SUPERFAMILIES_LEFT", data=rho_o_left) 153 | dset = f.create_dataset("RHO_SUPERFAMILIES_INTRA", data=rho_o_intra) 154 | f.create_dataset("RHO_SUPERFAMILIES_RIGHT", data=rho_o_right) 155 | f.close() 156 | return DensityData( 157 | "tests/input_data/test_swap_file.h5", 158 | dummy_gene_data, 159 | LOGGER, 160 | sense_swap=False, 161 | ) 162 | 163 | 164 | def test_swap_density_vals(density_data_test_obj_swap_vals): 165 | """Test whether or not left and right density values are swapped 166 | correctly""" 167 | # Set values for easy testing and checking 168 | # Shape of 
left/right orders is (no. of TE types, window, no. genes). 169 | # Here we do index 1, which corresponds to Gene2 170 | 171 | density_data_test_obj_swap_vals.left_orders[:, :, 1] = 100 172 | density_data_test_obj_swap_vals.right_orders[:, :, 1] = 200 173 | 174 | # Call the value swapper, manually, because it was set to NOT do it upon 175 | # initialization 176 | density_data_test_obj_swap_vals._swap_strand_vals(["dummy2"]) 177 | 178 | # Check the values 179 | assert np.array_equal( 180 | density_data_test_obj_swap_vals.left_orders[:, :, 1], np.full((2, 3), 200) 181 | ) 182 | assert np.array_equal( 183 | density_data_test_obj_swap_vals.right_orders[:, :, 1], np.full((2, 3), 100) 184 | ) 185 | 186 | 187 | if __name__ == "__main__": 188 | pytest.main(["-s", __file__]) # for convenience 189 | -------------------------------------------------------------------------------- /transposon/gene_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Wrappers for input data, multiple genes. 5 | 6 | Used to provide a common interface and fast calculations with numpy. 7 | """ 8 | 9 | __author__ = "Michael Teresi, Scott Teresi" 10 | 11 | import logging 12 | import numpy as np 13 | import pandas as pd 14 | 15 | from transposon.gene_datum import GeneDatum 16 | 17 | 18 | class GeneData(object): 19 | """Wraps a data frame containing many genes. 20 | Provides an interface, attribute access, and to/from disk functionality. 21 | 22 | Note the numpy views are not necessarily a no-copy (SEE pandas.DataFrame.to_numpy). 23 | 24 | Expects certain column identifiers (SEE self.__init__). 25 | GeneData subclasses should conform to these column names or redefine the properties. 26 | """ 27 | 28 | def __init__(self, gene_dataframe, genome_id, logger=None): 29 | """Initialize. 30 | 31 | Args: 32 | gene_dataframe (DataFrame): gene data frame. 33 | genome_id (str): a string of the genome name. 
34 | """ 35 | 36 | self._logger = logger or logging.getLogger(__name__) 37 | self.data_frame = gene_dataframe.copy(deep=True) 38 | self._names = self.data_frame.index # names of genes list(str) 39 | self.starts = self.data_frame.Start.to_numpy(copy=False) 40 | self.stops = self.data_frame.Stop.to_numpy(copy=False) 41 | self.lengths = self.data_frame.Length.to_numpy(copy=False) 42 | self.chromosomes = self.data_frame.Chromosome.to_numpy(copy=False) 43 | self.genome_id = genome_id 44 | self.add_genome_id() 45 | 46 | @classmethod 47 | def mock( 48 | cls, 49 | start_stop=np.array([[0, 9], [10, 19], [20, 29]]), 50 | genome_id="fake_genome_id", 51 | ): 52 | """Mocked data for testing. 53 | 54 | Args: 55 | start_stop (numpy.array): N gene x (start_idx, stop_idx) 56 | """ 57 | 58 | n_genes = start_stop.shape[0] 59 | data = [] 60 | for gi in range(n_genes): 61 | g0 = start_stop[gi, 0] 62 | g1 = start_stop[gi, 1] 63 | gL = g1 - g0 + 1 64 | name = "gene_{}".format(gi) 65 | chromosome = genome_id 66 | datum = [name, g0, g1, gL, chromosome] 67 | data.append(datum) 68 | 69 | col_names = ["Gene_Name", "Start", "Stop", "Length", "Chromosome"] 70 | frame = pd.DataFrame(data, columns=col_names) 71 | frame.set_index("Gene_Name", inplace=True) 72 | return GeneData(frame, genome_id) 73 | 74 | @classmethod 75 | def mock_v2( 76 | cls, 77 | start_stop=np.array([[500, 1000], [800, 1500], [1600, 2000]]), 78 | chromosome_ids=["Chrom_1", "Chrom_2", "Chrom_3"], 79 | genome_id="fake_genome_id", 80 | ): 81 | """Mocked data for testing. TODO refactor with main mock. 
82 | 83 | Args: 84 | start_stop (numpy.array): N gene x (start_idx, stop_idx) 85 | chromosome_id (list): List of string, len(N gene) 86 | """ 87 | 88 | n_genes = start_stop.shape[0] 89 | data = [] 90 | for gi in range(n_genes): 91 | g0 = start_stop[gi, 0] 92 | g1 = start_stop[gi, 1] 93 | gL = g1 - g0 + 1 94 | name = "gene_{}".format(gi) 95 | chromosome_id = chromosome_ids[gi] 96 | datum = [name, g0, g1, gL, chromosome_id] 97 | data.append(datum) 98 | 99 | col_names = ["Gene_Name", "Start", "Stop", "Length", "Chromosome"] 100 | frame = pd.DataFrame(data, columns=col_names) 101 | frame.set_index("Gene_Name", inplace=True) 102 | return GeneData(frame, genome_id) 103 | 104 | def write(self, filename): 105 | """Write a Pandaframe to disk. 106 | 107 | Args: 108 | filename (str): a string of the filename to write. 109 | 110 | """ 111 | # Begin refactor 112 | self.data_frame.to_csv(filename, sep="\t", header=True, index=True) 113 | 114 | @classmethod 115 | def read(cls, filename): 116 | """Read from disk. Returns a wrapped Pandaframe from an hdf5 file 117 | 118 | Args: 119 | filename (str): a string of the filename to write. 120 | """ 121 | # NOTE this is a little duplicate with import_filtered_genes 122 | # NOTE I don't have logger obj here, so talk to Mike if we just want to have 123 | # duplicate code. 
124 | data_frame = pd.read_csv( 125 | filename, 126 | header="infer", 127 | sep="\t", 128 | index_col="Gene_Name", 129 | dtype={ 130 | "Start": "float64", 131 | "Stop": "float64", 132 | "Length": "float64", 133 | "Chromosome": str, 134 | "Strand": str, 135 | "Feature": str, 136 | "Genome_ID": str, 137 | }, 138 | ) 139 | data_frame.sort_values(by=["Chromosome", "Start"], inplace=True) 140 | genome_id_list = data_frame["Genome_ID"].unique().tolist() 141 | if not genome_id_list: 142 | raise RuntimeError("column 'Genome_ID' is empty") 143 | elif len(genome_id_list) > 1: 144 | raise RuntimeError("Genome IDs are are not unique: %s" % genome_id_list) 145 | else: 146 | genome_id = genome_id_list[0] # MAGIC NUMBER list to string 147 | 148 | data_frame.drop(columns=["Genome_ID"], inplace=True) # NOTE, have to do 149 | # this to avoid it getting the column twice, perhaps refactor? 150 | 151 | new_instance = cls(data_frame, genome_id) 152 | return new_instance 153 | 154 | def get_gene(self, gene_id): 155 | """Return a GeneDatum for the gene identifier.""" 156 | 157 | return GeneDatum(self.data_frame, gene_id) 158 | 159 | def add_genome_id(self): 160 | """Add the genome_id as an extra column to the gene_dataframe""" 161 | self.data_frame["Genome_ID"] = self.genome_id 162 | 163 | @property 164 | def names(self): 165 | """Yields the names for each gene.""" 166 | 167 | return (name for name in self._names) 168 | 169 | @property 170 | def chromosome_unique_id(self): 171 | """Unique chromosome identifier for all the genes available. 172 | 173 | This will raise if the genes are not from the same chromosome, 174 | for example you you didn't split the dataset wrt this data. 175 | 176 | Returns: 177 | str: the unique identifier. 178 | Raises: 179 | RuntimeError: if multiple chromosomes are in the data frame (i.e. no unique). 
180 | """ 181 | 182 | chromosome_list = self.data_frame.Chromosome.unique().tolist() 183 | if not chromosome_list: 184 | raise RuntimeError("column 'Chromosome' is empty") 185 | elif len(chromosome_list) > 1: 186 | raise RuntimeError("chromosomes are not unique: %s" % chromosome_list) 187 | else: 188 | return chromosome_list[0] # MAGIC NUMBER list to string 189 | 190 | def __repr__(self): 191 | """String representation for developer.""" 192 | 193 | return "GeneData{}".format(self.data_frame) 194 | -------------------------------------------------------------------------------- /transposon/verify_cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Contains methods for verifying various cached input data. 5 | """ 6 | 7 | __author__ = "Scott Teresi" 8 | 9 | import os 10 | import sys 11 | import pandas as pd 12 | from transposon.revise_annotation import ReviseAnno 13 | from transposon.import_filtered_genes import import_filtered_genes 14 | from transposon.import_filtered_TEs import import_filtered_TEs 15 | 16 | 17 | def verify_chromosome_h5_cache( 18 | gene_data_obj, 19 | te_data_obj, 20 | g_filepath, 21 | t_filepath, 22 | reset_h5, # TODO edit this later 23 | cache_location, 24 | genes_input_file, 25 | tes_input_file, 26 | chrom_id, 27 | logger, 28 | ): 29 | """Determine whether or not previously saved gene_data and TransposonData 30 | exist in .tsv format. Each file represents either gene_data or 31 | TransposonData for one chromosome at a time. Save/update files as 32 | necessary. 33 | 34 | When are the tsv cache files written? 35 | 1. Files will be written if there are no current corresponding files 36 | saved on disk. 37 | 2. If a command-line option is passed to density.py to re-write the 38 | files, this option defaults to NOT re-write. 39 | # TODO check 40 | 3. If enough time has passed between the creation of the H5 file and the 41 | current run-time of the program. 
TODO talk to Michael more about this. 42 | 43 | Args: 44 | gene_data_obj (gene_data): Instance of gene_data 45 | te_data_obj (TransposonData): Instance of TransposonData 46 | g_filepath (str): The string of the filename in which to save the 47 | gene_data as an H5 file. 48 | t_filepath (str): The string of the filename in which to save the 49 | TransposonData as an H5 file. 50 | reset_h5 (bool): Boolean, whether or not to completely rewrite the 51 | cache of all H5 files. True means that we will rewrite. 52 | cache_location (str): The location (file path) in which to store the 53 | cache gene and TE data files. 54 | genes_input_file (str): The file path of the file that was used to 55 | generate the gene_data instance. 56 | tes_input_file (str): The file path of the file that was used to 57 | generate the TransposonData instance. 58 | chrom_id (str): A string representation of the current chromosome. Used 59 | to name each H5 file. 60 | """ 61 | if reset_h5: 62 | logger.info("overwrite: %s" % g_filepath) 63 | logger.info("overwrite: %s" % t_filepath) 64 | gene_data_obj.write(g_filepath) 65 | te_data_obj.write(t_filepath) 66 | 67 | if os.path.exists(g_filepath) and os.path.exists(t_filepath): 68 | gene_annot_time = os.path.getmtime(genes_input_file) 69 | te_annot_time = os.path.getmtime(tes_input_file) 70 | gene_h5_time = os.path.getmtime(g_filepath) 71 | te_h5_time = os.path.getmtime(t_filepath) 72 | 73 | if (gene_annot_time > gene_h5_time) and (te_annot_time > te_h5_time): 74 | logger.info("cache is too old for chromosome '%s'" % chrom_id) 75 | logger.info("write: %s" % g_filepath) 76 | logger.info("write: %s" % t_filepath) 77 | gene_data_obj.write(g_filepath) 78 | te_data_obj.write(t_filepath) 79 | 80 | elif (gene_annot_time < gene_h5_time) and (te_annot_time < te_h5_time): 81 | # No need to re-write a current cache 82 | return 83 | 84 | elif reset_h5 or (not (os.path.exists(g_filepath) and os.path.exists(t_filepath))): 85 | gene_data_obj.write(g_filepath) 86 | 
te_data_obj.write(t_filepath) 87 | else: 88 | logger.critical( 89 | """During the verification of the H5 cache nothing was 90 | saved because 0 conditions were met.""" 91 | ) 92 | 93 | 94 | def verify_TE_cache(tes_input_file, logger): 95 | """Read a preprocessed/filtered TE annotation file from disk; return a 96 | pandaframe of the file, no modifications are made to the data. 97 | 98 | Args: 99 | tes_input_file (str): A command line argument, this is the location 100 | of the processed TE annotation file. 101 | 102 | Returns: 103 | te_data (pandas.DataFrame): A pandas dataframe of the TE data 104 | """ 105 | logger.info("Reading pre-filtered TE annotation file: %s" % tes_input_file) 106 | te_data = import_filtered_TEs(tes_input_file, logger) 107 | return te_data 108 | 109 | 110 | def verify_gene_cache(genes_input_file, logger): 111 | """Read a preprocessed/filtered gene annotation file from disk; return a 112 | pandaframe of the file, no modifications are made to the data. 113 | 114 | Args: 115 | genes_input_file (str): A command line argument, this is the location 116 | of the processed gene annotation file. 117 | 118 | Returns: 119 | gene_data (pandas.DataFrame): the gene data container 120 | """ 121 | logger.info("Reading pre-filtered gene annotation file: %s" % genes_input_file) 122 | gene_data = import_filtered_genes(genes_input_file, logger) 123 | return gene_data 124 | 125 | 126 | def revise_annotation( 127 | te_data, revise_anno, revised_transposons_loc, revised_cache_loc, logger, genome_id 128 | ): 129 | """Remove overlapping elements of the same type. 130 | 131 | Revises the annotation so that elements of the same type do not overlap at 132 | all. Will essentially merge elements together, elongating them. This is 133 | done so that the mathematics of density make sense. 
You can elect to not 134 | use the revised annotation through a command-line argument to density.py, 135 | however given that TEs often overlap with one another in annotatios (not 136 | just being nested in one another) it can lead to some difficulties in 137 | accurately assessing density and obfuscate the density results. 138 | 139 | Args: 140 | te_data (pandas.core.DataFrame): A PandaFrame of the TE data, 141 | previously imported from raw and filtered or imported from a previously 142 | filtered data file that was saved to disk. 143 | 144 | revise_anno (bool): A boolean of whether or not to use/create a revised 145 | annotation 146 | 147 | revised_transposons (str): A string representing the path of a 148 | previously filtered (cleaned) and revised TE annotation. 149 | 150 | revised_cache_loc (): Directory for output files 151 | 152 | logger (): 153 | 154 | genome_id (str): String of the genome ID 155 | 156 | Returns: 157 | te_data (pandaframe): A pandas dataframe of the revised TE data 158 | """ 159 | 160 | if os.path.exists(revised_transposons_loc) and not revise_anno: 161 | logger.info("load revised TE: %s" % revised_transposons_loc) 162 | te_data = import_filtered_TEs(revised_transposons_loc, logger) 163 | else: 164 | logger.info("creating revised TE dataset...") 165 | logger.info("revising the TE dataset will take a long time!") 166 | # N.B we want higher recursion limit for the code 167 | sys.setrecursionlimit(11 ** 6) 168 | revised_te_data = ReviseAnno( 169 | te_data, revised_transposons_loc, revised_cache_loc, genome_id 170 | ) 171 | revised_te_data.create_superfam() 172 | revised_te_data.create_order() 173 | revised_te_data.create_nameless() 174 | revised_te_data.verify_files() 175 | te_data = revised_te_data.whole_te_annotation 176 | return te_data 177 | --------------------------------------------------------------------------------