├── .documentation.yml ├── .eslintrc.js ├── .github └── workflows │ └── build-and-deploy.yml ├── .gitignore ├── .prettierignore ├── .prettierrc ├── CHANGELOG.md ├── README.md ├── dist ├── assets │ ├── deflate-4cf9f098-2cfae813.js │ ├── html2canvas.esm-e0a7d97b.js │ ├── index-3c0b6e34.js │ ├── index-7a646334.css │ ├── index.es-72b99e69.js │ ├── jpeg-e220efb2-2c6767e9.js │ ├── jspdf.es.min-a0860ec0.js │ ├── lerc-12b63f0c-ae350862.js │ ├── lzw-68593c7a-14a83b0e.js │ ├── packbits-c500b823-7261db87.js │ ├── pako.esm-68f84e2a-fb87a468.js │ ├── purify.es-f47f2ec0.js │ ├── raw-d95d6509-33ef364a.js │ └── webimage-7938b145-12f2a36a.js └── index.html ├── docs └── Introduction.md ├── index.html ├── notebooks ├── .gitignore ├── README.md ├── data_processing_20200429 │ ├── README.md │ └── process_chr22_dataset.ipynb ├── data_processing_20200520 │ └── create_large_multivec.ipynb └── environment.yml ├── package.json ├── pipelines ├── cistrome-table │ └── row-info-to-table.py ├── cistrome-to-multivec │ ├── .gitignore │ ├── README.md │ ├── Snakefile │ ├── cluster-profile.yml │ ├── config.yml │ ├── environment.yml │ ├── generate_config.py │ ├── higlass_ingest.sh │ ├── inspect.ipynb │ ├── src │ │ ├── bigwigs_to_manifest.py │ │ ├── manifest_to_mv5.py │ │ ├── manifest_to_zarr.py │ │ └── utils.py │ └── submit.sh └── mira-data │ ├── .gitignore │ ├── README.md │ ├── cliff_code.py │ ├── environment.yml │ ├── max_topic.js │ ├── mm10.chrom.sizes │ ├── mouse_brain_tutorial.ipynb │ ├── process.py │ └── rowInfo.ipynb ├── src ├── ContextMenu.jsx ├── ContextMenu.scss ├── GeneExpressionSelection.jsx ├── GeneExpressionSelection.scss ├── GwasFilter.jsx ├── HiGlassMeta.jsx ├── HiGlassMetaConsumer.jsx ├── HiGlassMetaConsumer.scss ├── RangeSlider.jsx ├── RangeSlider.scss ├── Tooltip.jsx ├── Tooltip.scss ├── TrackRowFilter.jsx ├── TrackRowFilter.scss ├── TrackRowHighlight.jsx ├── TrackRowInfo.jsx ├── TrackRowInfoControl.jsx ├── TrackRowInfoVis.jsx ├── TrackRowInfoVisBand.jsx ├── TrackRowInfoVisDendrogram.jsx ├── TrackRowInfoVisExpression.jsx ├── TrackRowInfoVisLink.jsx ├── TrackRowInfoVisNominalBar.jsx ├── TrackRowInfoVisNominalDynamic.jsx ├── TrackRowInfoVisQuantitativeBar.jsx ├── TrackRowInfoVisTree.jsx ├── TrackRowZoomOverlay.jsx ├── TrackRowZoomOverlay.scss ├── TrackWrapper.jsx ├── ViewColumnBrush.jsx ├── ViewColumnBrush.scss ├── ViewWrapper.jsx ├── ViewWrapper.scss ├── cistrome-api │ ├── bigwig.js │ └── cistrome-track.js ├── demo │ ├── CistromeExplorer.jsx │ ├── CistromeExplorer.scss │ ├── CistromeToolkit.jsx │ ├── CistromeToolkit.scss │ ├── DataTable.jsx │ ├── DataTable.scss │ ├── demo.js │ ├── fakedata │ │ ├── cistrome-track-1 │ │ │ └── rowInfo.json │ │ ├── cistrome-track-10 │ │ │ └── rowInfo.json │ │ ├── cistrome-track-2 │ │ │ └── rowInfo.json │ │ ├── cistrome-track-3 │ │ │ └── rowInfo.json │ │ ├── cistrome-track-3k27 │ │ │ ├── rowInfo.json │ │ │ ├── rowInfoRevision.json │ │ │ └── rowInfoWithQc.json │ │ ├── cistrome-track-3k4 │ │ │ ├── rowInfo.json │ │ │ └── rowInfoRevision.json │ │ ├── cistrome-track-atac │ │ │ ├── rowInfo.json │ │ │ ├── rowInfoRevision.json │ │ │ └── rowInfoWithQc.json │ │ ├── index.js │ │ └── mira-track-mouse │ │ │ ├── process.py │ │ │ ├── rowInfo.json │ │ │ ├── rowInfo4000.json │ │ │ ├── rowInfo4000AllData.json │ │ │ ├── rowInfo500Smooth.json │ │ │ ├── rowInfo500SmoothTree.json │ │ │ ├── rowInfoWithCategory.json │ │ │ └── rowInfoWithCategoryAllData.json │ ├── index.jsx │ └── index.scss ├── index.js ├── scale-legend │ └── ScaleLegendTrack.js ├── utils │ ├── aggregate.js │ ├── array.js │ ├── canvas.js 
│ ├── chromsizes.js │ ├── cistrome.js │ ├── color.js │ ├── constants.js │ ├── contexts.jsx │ ├── d3.js │ ├── genome.js │ ├── gwas.js │ ├── icons.js │ ├── layout.js │ ├── layout.spec.js │ ├── linking.js │ ├── options.js │ ├── options.spec.js │ ├── select-rows.js │ ├── select-rows.spec.js │ ├── toolkit.js │ ├── tree.js │ ├── tree.spec.js │ ├── two.js │ ├── view-history.js │ ├── viewconf.js │ ├── viewconf.spec.js │ ├── viewport.js │ ├── vis.js │ ├── visualization-properties.js │ └── wrap-svg.js └── viewconfigs │ ├── horizontal-multivec-1.js │ ├── horizontal-multivec-10.json │ ├── horizontal-multivec-11.json │ ├── horizontal-multivec-1b.json │ ├── horizontal-multivec-2.json │ ├── horizontal-multivec-2b.json │ ├── horizontal-multivec-3.json │ ├── horizontal-multivec-3k27-revision.js │ ├── horizontal-multivec-3k27.js │ ├── horizontal-multivec-3k4-revision.js │ ├── horizontal-multivec-3k4.js │ ├── horizontal-multivec-4.json │ ├── horizontal-multivec-5.json │ ├── horizontal-multivec-6.json │ ├── horizontal-multivec-7.json │ ├── horizontal-multivec-8.json │ ├── horizontal-multivec-9.json │ ├── horizontal-multivec-atac-revision.js │ ├── horizontal-multivec-atac.js │ ├── horizontal-multivec-mira-mouse.js │ └── meeting-2020-04-29.json ├── vite.config.js └── yarn.lock /.documentation.yml: -------------------------------------------------------------------------------- 1 | toc: 2 | - name: Introduction 3 | file: docs/Introduction.md 4 | - name: React Components 5 | - HiGlassMeta 6 | - name: React Components (internal) 7 | - HiGlassMetaConsumer 8 | - InfoProvider 9 | - DataTable 10 | - CistromeToolkit 11 | - ContextMenu 12 | - Tooltip 13 | - TooltipContent 14 | - TrackRowHighlight 15 | - TrackRowInfo 16 | - TrackRowInfoControl 17 | - TrackRowInfoVis 18 | - TrackRowInfoVisDendrogram 19 | - TrackRowInfoVisLink 20 | - TrackRowInfoVisNominalBar 21 | - TrackRowInfoVisQuantitativeBar 22 | - TrackRowFilter 23 | - TrackRowZoomOverlay 24 | - RangeSlider 25 | - TrackWrapper 26 | - ViewWrapper 27 | - ViewColumnBrush 28 | - SuggestionWithHighlight 29 | - name: Functions (internal) 30 | - getConditionFromHighlightOption 31 | - getHighlightKeyByFieldType 32 | - addTrackWrapperOptions 33 | - validateWrapperOptions 34 | - processWrapperOptions 35 | - updateWrapperOptions 36 | - getWrapperSubOptions 37 | - getTrackWrapperOptions 38 | - getRetinaRatio 39 | - traverseViewConfig 40 | - addTrackDefToViewConfig 41 | - getTrackDefFromViewConfig 42 | - getHMTrackIdsFromViewConfig 43 | - getSiblingVPHTrackIdsFromViewConfig 44 | - updateViewConfigOnSelectGenomicInterval 45 | - removeViewportFromViewConfig 46 | - updateViewConfigOnSelectRowsByTrack 47 | - getHMSelectedRowsFromViewConfig 48 | - getUniqueViewOrTrackId 49 | - getAllViewAndTrackPairs 50 | - resolveIntervalCoordinates 51 | - selectRows 52 | - highlightRowsFromSearch 53 | - getAggregatedRowInfo 54 | - getAggregatedValue 55 | - getNumOfTracks 56 | - insertItemToArray 57 | - modifyItemInArray 58 | - removeItemFromArray 59 | - drawRowHighlightRect 60 | - drawVisTitle 61 | - matrixToTree 62 | - matrixToTreeWithDistance 63 | - createReducer 64 | - getReadableTable 65 | - validateIntervalParams 66 | - validateGeneParams 67 | - makeDBToolkitIntervalAPIURL 68 | - makeDBToolkitGeneAPIURL 69 | - requestByInterval 70 | - requestByGene 71 | - componentToHex 72 | - rgbToHex 73 | - generateNextUniqueColor 74 | - getRange 75 | - wrapSvg 76 | - name: Classes (internal) 77 | - Two 78 | - TwoRectangle 79 | - TwoCircle 80 | - TwoLine 81 | - TwoPath 82 | - TwoText 83 | 
-------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "env": { 3 | "browser": true, 4 | "es2021": true, 5 | jest: true 6 | }, 7 | "extends": [ 8 | "eslint:recommended", 9 | "plugin:react/recommended" 10 | ], 11 | "parserOptions": { 12 | "ecmaFeatures": { 13 | "jsx": true 14 | }, 15 | "ecmaVersion": 13, 16 | "sourceType": "module" 17 | }, 18 | "plugins": [ 19 | "react", 20 | "jest" 21 | ], 22 | settings: { 23 | react: { 24 | version: "detect" 25 | } 26 | }, 27 | "rules": { 28 | "indent": [ 29 | "error", 30 | 4 31 | ], 32 | "react/prop-types": 0, 33 | "no-unused-vars": "warn", 34 | "no-prototype-builtins": "warn", 35 | "no-constant-condition": 0, 36 | "react/display-name": 0, 37 | "linebreak-style": [ 38 | "error", 39 | "unix" 40 | ], 41 | "quotes": [ 42 | "error", 43 | "double" 44 | ], 45 | "semi": [ 46 | "error", 47 | "always" 48 | ] 49 | } 50 | }; 51 | -------------------------------------------------------------------------------- /.github/workflows/build-and-deploy.yml: -------------------------------------------------------------------------------- 1 | name: Build and deploy docs and app 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | 8 | jobs: 9 | build: 10 | name: Test and build 11 | 12 | runs-on: ubuntu-latest 13 | 14 | strategy: 15 | matrix: 16 | node-version: [ 16.x, 18.x ] 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | - name: Use Node.js ${{ matrix.node-version }} 21 | uses: actions/setup-node@v3 22 | with: 23 | node-version: ${{ matrix.node-version }} 24 | - run: yarn install 25 | - run: yarn test 26 | - run: yarn build 27 | 28 | deploy: 29 | name: Deploy 30 | 31 | runs-on: ubuntu-latest 32 | 33 | needs: [ build ] 34 | 35 | if: github.ref == 'refs/heads/master' 36 | 37 | steps: 38 | - uses: actions/checkout@v3 39 | - uses: actions/setup-node@v3 40 | with: 41 | node-version: 16 42 | - run: yarn install 43 | - run: yarn build 44 | - run: yarn docs 45 | - run: mv ./build-docs ./build-demo/docs 46 | - name: Deploy App 47 | run: | 48 | git config --global user.name "GitHub Action" 49 | git config --global user.email "action@github.com" 50 | git remote set-url origin https://${ACCESS_TOKEN}@github.com/${REPO}.git 51 | yarn deploy 52 | env: 53 | ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} 54 | REPO: ${{ github.repository }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # production 12 | /build 13 | /build-demo 14 | /build-pkg 15 | /build-docs 16 | 17 | # misc 18 | .DS_Store 19 | .env.local 20 | .env.development.local 21 | .env.test.local 22 | .env.production.local 23 | .vscode/ 24 | 25 | npm-debug.log* 26 | yarn-debug.log* 27 | yarn-error.log* 28 | 29 | *.profraw 30 | .envrc 31 | 32 | # vim 33 | *.swp 34 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | docs/ 2 | build-demo/ 3 | build/ 4 | pipelines/ 5 | **/notebooks/* 6 | **/fakedata/* 7 | **/node_modules/* 8 | *.lock 9 | LICENSE 10 | *.log 11 | *.yaml 12 | *.yml 13 | **/dist/* -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 120, 3 | "tabWidth": 4, 4 | "useTabs": true, 5 | "singleQuote": true, 6 | "bracketSpacing": true, 7 | "arrowParens": "avoid", 8 | "trailingComma": "none", 9 | "proseWrap": "never" 10 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cistrome Explorer 2 | 3 | An interactive visual analytics tool for exploring epigenomics data with associated metadata, powered by [HiGlass](http://higlass.io/) and the [Cistrome Data Browser Toolkit](http://dbtoolkit.cistrome.org/) 4 | 5 | - [Demo](http://cisvis.gehlenborglab.org/) - [Documentation](http://cisvis.gehlenborglab.org/docs/) 7 | 8 | ### Development 9 | 10 | Install dependencies with yarn ([v1](http://classic.yarnpkg.com)): 11 | 12 | ```sh 13 | yarn 14 | ``` 15 | 16 | Run the development server for the demo app: 17 | 18 | ```sh 19 | yarn start 20 | ``` 21 | 22 | Run tests: 23 | 24 | ```sh 25 | yarn test 26 | # or 27 | yarn test --watch 28 | ``` 29 | 30 | ### Production 31 | 32 | Build the demo app for production: 33 | 34 | ```sh 35 | yarn build 36 | ``` 37 | 38 | Build the NPM package: 39 | 40 | ```sh 41 | yarn build-pkg 42 | ``` 43 | 44 | ### Docs 45 | 46 | ```sh 47 | yarn docs 48 | ``` 49 | 50 | ### Data processing resources 51 | 52 | - Notebooks for short analyses or file conversions: [./notebooks/](./notebooks/) 53 | - Pipeline for combining CistromeDB bigWig files into HiGlass multivec (HDF5-based) files: [./pipelines/cistrome-to-multivec/](./pipelines/cistrome-to-multivec/) 54 | 55 | ### Presentations and other resources 56 | 57 | - [Video](https://drive.google.com/file/d/1SrtFHrEuJY5zHuPjPkBmPTxgZPRQ0qRR/view) & [Slides](https://drive.google.com/file/d/1Z4tO-lrClZY3P7_n2N3kar5YoQoMNVCh/view?usp=sharing) for [NCI ITCR 2020 virtual poster](https://ncihub.org/groups/itcr/2020_virtual_posters) 58 | - [Poster](https://drive.google.com/file/d/1r0jPwyTlEYGotsrfD2KbJU5r-OEYU5Q5/view?usp=sharing) for [BioVis@ISMB 2020](http://biovis.net/2020/program_ismb/) 59 | 60 | ### Related repositories 61 | 62 | - [HiGlass](https://github.com/higlass/higlass) 63 | - [HiGlass Server](https://github.com/higlass/higlass-server) and our fork [Cistrome Explorer HiGlass Server](https://github.com/hms-dbmi/cistrome-explorer-higlass-server) 64 | - To deploy `cistrome-explorer-higlass-server` to AWS ECS, please refer to the modified `docker-context/` directory and associated README
[here](https://github.com/hms-dbmi/cistrome-explorer-higlass-server/blob/develop/docker-context/README.md). 65 | - [clodius](https://github.com/higlass/clodius) 66 | - [pybbi](https://github.com/nvictus/pybbi) 67 | -------------------------------------------------------------------------------- /dist/assets/deflate-4cf9f098-2cfae813.js: -------------------------------------------------------------------------------- 1 | import{i as r}from"./pako.esm-68f84e2a-fb87a468.js";import{B as a}from"./index-3c0b6e34.js";class s extends a{decodeBlock(e){return r(new Uint8Array(e)).buffer}}export{s as default}; 2 | -------------------------------------------------------------------------------- /dist/assets/lzw-68593c7a-14a83b0e.js: -------------------------------------------------------------------------------- 1 | import{B as b}from"./index-3c0b6e34.js";const B=9,E=256,p=257,k=12;function v(c,o,r){const i=o%8,t=Math.floor(o/8),h=8-i,g=o+r-(t+1)*8;let l=8*(t+2)-(o+r);const w=(t+2)*8-o;if(l=Math.max(0,l),t>=c.length)return console.warn("ran off the end of the buffer before finding EOI_CODE (end on input code)"),p;let u=c[t]&2**(8-i)-1;u<<=r-h;let s=u;if(t+1>>l;f<<=Math.max(0,r-w),s+=f}if(g>8&&t+2>>f;s+=n}return s}function D(c,o){for(let r=o.length-1;r>=0;r--)c.push(o[r]);return c}function x(c){const o=new Uint16Array(4093),r=new Uint8Array(4093);for(let e=0;e<=257;e++)o[e]=4096,r[e]=e;let i=258,t=B,h=0;function g(){i=258,t=B}function l(e){const a=v(e,h,t);return h+=t,a}function w(e,a){return r[i]=a,o[i]=e,i++,i-1}function u(e){const a=[];for(let y=e;y!==4096;y=o[y])a.push(r[y]);return a}const s=[];g();const f=new Uint8Array(c);let n=l(f),d;for(;n!==p;){if(n===E){for(g(),n=l(f);n===E;)n=l(f);if(n===p)break;if(n>E)throw new Error(`corrupted code at scanline ${n}`);{const e=u(n);D(s,e),d=n}}else if(n=2**t&&(t===k?d=void 0:t++),n=l(f)}return new Uint8Array(s)}class C extends b{decodeBlock(o){return x(o).buffer}}export{C as default}; 2 | -------------------------------------------------------------------------------- /dist/assets/packbits-c500b823-7261db87.js: -------------------------------------------------------------------------------- 1 | import{B as c}from"./index-3c0b6e34.js";class l extends c{decodeBlock(s){const n=new DataView(s),r=[];for(let e=0;e"u")throw new Error("Cannot decode WebImage as `createImageBitmap` is not available");if(typeof document>"u"&&typeof OffscreenCanvas>"u")throw new Error("Cannot decode WebImage as neither `document` nor `OffscreenCanvas` is not available")}async decode(i,n){const o=new Blob([n]),e=await createImageBitmap(o);let t;typeof document<"u"?(t=document.createElement("canvas"),t.width=e.width,t.height=e.height):t=new OffscreenCanvas(e.width,e.height);const a=t.getContext("2d");return a.drawImage(e,0,0),a.getImageData(0,0,e.width,e.height).data.buffer}}export{s as default}; 2 | -------------------------------------------------------------------------------- /dist/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | Cistrome Explorer 14 | 15 | 16 | 17 | 18 |
19 | 20 | 21 | -------------------------------------------------------------------------------- /docs/Introduction.md: -------------------------------------------------------------------------------- 1 | [Cistrome Explorer](http://cisvis.gehlenborglab.org) is an interactive visualization tool to explore and analyze [cistrome data](http://cistrome.org/db/#/). 2 | Cistrome Explorer uses [HiGlass](https://higlass.io) to show cistrome data in heatmaps while providing HiGlass wrapper components for showing metadata, such as dendrograms for hierarchical clustering results and bar charts for quality scores. These wrapper components are highly interactive, allowing users to rearrange, filter, and highlight rows in HiGlass heatmaps. Take a look at the [demo](http://cisvis.gehlenborglab.org) or the [source code](https://github.com/hms-dbmi/cistrome-explorer) on GitHub. 3 | 4 | [Cistrome Explorer](http://cisvis.gehlenborglab.org) supports two types of wrapper components: *sample-wise* and *genome-wise* components. 5 | 6 | Sample-wise components: 7 | 8 | 1. **Nominal Bar Chart**: A nominal data field, such as a cell type, can be visualized using a nominal bar chart that shows text labels with different colors for individual categories. 9 | 1. **Quantitative Bar Chart**: One or more quantitative data fields, such as quality scores for individual samples, can be represented as a horizontal (stacked) bar chart. 10 | 1. **Dendrogram**: Tree-structured data, such as hierarchical clustering results, can be visualized using a Dendrogram track. You can filter or highlight rows by selecting a branch or moving a minimum similarity bar using the mouse cursor. 11 | 12 | 1. **Link**: Useful external links for individual samples can be shown with a Link track, where clicking a link opens the corresponding website. 13 | 14 | Genome-wise component: 15 | 16 | 1. **Genomic Interval Selection Bar**: To support querying [Cistrome DB](http://cistrome.org/db/#/), a genomic interval selection bar can be displayed at the bottom of each HiGlass heatmap. On this component, you can select multiple regions of interest and query for a data table that shows factors bound in these regions. To learn more about the query, please refer to the [Cistrome ToolKit](http://dbtoolkit.cistrome.org). 17 | 18 | You can set up the wrapper components using the JSON format specification.
19 | For example, the following specification generates four tracks each on the left and right of a HiGlass heatmap, with two sorting options and one filtering option: 20 | 21 | ```javascript 22 | { 23 | rowInfoAttributes: [ 24 | {field: "Hierarchical Clustering (Average)", type: "tree", position: "left"}, 25 | {field: "qc_frip", type: "quantitative", position: "left"}, 26 | {field: "qc_fastqc", type: "quantitative", position: "left"}, 27 | {field: "Metadata URL", type: "url", position: "left", title: "cid"}, 28 | {field: "Hierarchical Clustering (Ward)", type: "tree", position: "right"}, 29 | {field: "Cell Type", type: "nominal", position: "right"}, 30 | {field: "Tissue Type", type: "nominal", position: "right"}, 31 | {field: "Species", type: "nominal", position: "right"} 32 | ], 33 | rowSort: [ 34 | {field: "Tissue Type", type: "nominal", order: "ascending"}, 35 | {field: "qc_frip", type: "quantitative", order: "descending"} 36 | ], 37 | rowFilter: [ 38 | {field: "Tissue Type", type: "nominal", notOneOf: ["None"]} 39 | ] 40 | } 41 | ``` 42 | 43 | 44 | 45 | Please refer to the definition of the [JSON schema](https://github.com/hms-dbmi/cistrome-explorer/blob/e1f9d2e83fa7af684f6cb827b8f7aae92a6f6b8a/src/utils/options.js#L9). -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 13 | 14 | 19 | 20 | 21 | Cistrome Explorer 22 | 23 | 24 |
25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .snakemake/ 3 | __pycache__/ 4 | *.csv 5 | *.arrow 6 | *.zarr 7 | *.txt 8 | *.mv5 9 | *.bed 10 | *.bw 11 | *.json 12 | data/ -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # notebooks 2 | 3 | One-off Jupyter notebooks used for small data processing tasks can be placed here. 4 | 5 | Consider extracting tasks that become more computationally intensive or frequently used, converting them into more robust [`higlass-server`](https://github.com/higlass/higlass-server/) or [`clodius`](https://github.com/higlass/clodius) processing steps. 6 | 7 | ## Setup 8 | 9 | ```sh 10 | conda env create -f environment.yml 11 | ``` 12 | 13 | ## Start 14 | 15 | ```sh 16 | conda activate cistrome-explorer-notebooks 17 | jupyter notebook 18 | ``` 19 | -------------------------------------------------------------------------------- /notebooks/data_processing_20200429/README.md: -------------------------------------------------------------------------------- 1 | After running the notebook, ingest the tileset using the higlass-server `manage.py` script: 2 | 3 | ```sh 4 | python manage.py ingest_tileset \ 5 | --uid 'Cistrome_DNase_1kb_average' \ 6 | --filename ./Cistrome_DNase_1kb_average_chr1_to_chr22.multires.mv5 \ 7 | --filetype multivec \ 8 | --coordSystem hg38 9 | 10 | ``` 11 | -------------------------------------------------------------------------------- /notebooks/data_processing_20200520/create_large_multivec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import h5py\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import json\n", 13 | "import os\n", 14 | "from os.path import join\n", 15 | "import tempfile\n", 16 | "import negspy.coordinates as nc" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "f = h5py.File(\"../data_processing_20200429/my_file_genome_wide_20180228.multires.mv5\", \"r\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "def descend_obj(obj,sep='\\t'):\n", 35 | "    \"\"\"\n", 36 | "    Iterate through groups in an HDF5 file, printing group and dataset names and dataset attributes\n", 37 | "    \"\"\"\n", 38 | "    if type(obj) in [h5py._hl.group.Group,h5py._hl.files.File]:\n", 39 | "        for key in obj.keys():\n", 40 | "            print(sep,'-',key,':',obj[key])\n", 41 | "            descend_obj(obj[key],sep=sep+'\\t')\n", 42 | "    elif type(obj)==h5py._hl.dataset.Dataset:\n", 43 | "        for key in obj.attrs.keys():\n", 44 | "            print(sep+'\\t','-',key,':',obj.attrs[key])" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "descend_obj(f)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "type(f)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 |
"source": [ 71 | "def create_multivec_file():\n", 72 | " tf = tempfile.TemporaryFile()\n", 73 | " f = h5py.File(tf, 'w')\n", 74 | " \n", 75 | " # Create level zero groups\n", 76 | " info_group = f.create_group(\"info\")\n", 77 | " resolutions_group = f.create_group(\"resolutions\")\n", 78 | " \n", 79 | " # Prepare to fill in chroms dataset\n", 80 | " chromosomes = nc.get_chromorder('hg38')\n", 81 | " num_chromosomes = len(chromosomes)\n", 82 | " chroms_length_arr = np.array([ nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes ], dtype=\"i8\")\n", 83 | " chroms_name_arr = np.array(chromosomes, dtype=\"S23\")\n", 84 | " \n", 85 | " def create_chroms_group(d):\n", 86 | " chroms_group = d.create_group(\"chroms\")\n", 87 | " # Fill in chroms dataset entries \"length\" and \"name\"\n", 88 | " chroms_group.create_dataset(\"length\", data=chroms_length_arr)\n", 89 | " chroms_group.create_dataset(\"name\", data=chroms_name_arr)\n", 90 | " \n", 91 | " create_chroms_group(f)\n", 92 | " \n", 93 | " # Prepare to fill in resolutions dataset\n", 94 | " resolutions = [ str(1000*(2**x)) for x in range(15)]\n", 95 | " print(resolutions)\n", 96 | " \n", 97 | " # Fill in resolutions dataset entries\n", 98 | " for resolution in resolutions:\n", 99 | " # Create each resolution group\n", 100 | " resolution_group = resolutions_group.create_group(resolution)\n", 101 | " \n", 102 | " create_chroms_group(resolution_group)\n", 103 | " resolution_values_group = resolution_group.create_group(\"values\")\n", 104 | " \n", 105 | " for chromosome in chromosomes:\n", 106 | " # TODO: fill in the chromosome values\n", 107 | " chrom_dataset = resolution_group.create_dataset(chromosome, (1, 1), dtype=\"f4\")\n", 108 | " \n", 109 | " return f\n", 110 | " " 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "len(nc.get_chromorder('hg38'))" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "nc.get_chrominfo('hg38').chrom_lengths[\"chr2\"]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "descend_obj(create_multivec_file())" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python [conda env:cistrome-explorer-notebooks] *", 158 | "language": "python", 159 | "name": "conda-env-cistrome-explorer-notebooks-py" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.7.6" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 4 176 | } 177 | -------------------------------------------------------------------------------- /notebooks/environment.yml: -------------------------------------------------------------------------------- 1 | name: cistrome-explorer-notebooks 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - scottx611x 6 | dependencies: 7 | - python>=3.5 8 | - numpy 9 
| - pandas 10 | - requests 11 | - h5py 12 | - dask 13 | - flake8 14 | - autopep8 15 | - pysam 16 | - snakemake 17 | - scipy 18 | - jupyter 19 | - nb_conda 20 | - pip 21 | - pip: 22 | - negspy 23 | - slugid 24 | - sortedcontainers 25 | - nose 26 | - cooler>=0.8.5 27 | - click>=7 28 | - pytest 29 | - pytest-cov 30 | - bumpversion 31 | - pybbi>=0.2.2 32 | - clodius 33 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cistrome-explorer", 3 | "version": "0.6.0", 4 | "author": "Gehlenborg Lab (http://gehlenborglab.org/)", 5 | "license": "MIT", 6 | "repository": { 7 | "type": "git", 8 | "url": "https://github.com/hms-dbmi/cistrome-explorer" 9 | }, 10 | "homepage": "http://cisvis.gehlenborglab.org", 11 | "dependencies": { 12 | "@vitessce/zarr": "^3.0.0", 13 | "ajv": "^6.11.0", 14 | "d3": "^5.15.0", 15 | "d3-delaunay": "^5.2.1", 16 | "gosling-theme": "^0.0.10", 17 | "higlass-multivec": "0.2.8", 18 | "higlass-register": "^0.3.0", 19 | "js-cookie": "^2.2.1", 20 | "lodash": "^4.17.19", 21 | "node-sass": "^6.0.1", 22 | "pubsub-js": "^1.8.0", 23 | "tree.gl": "^0.0.1", 24 | "uuid": "^3.4.0", 25 | "vega-scale": "^6.0.0" 26 | }, 27 | "peerDependencies": { 28 | "gosling.js": "^0.9.31", 29 | "higlass": "^1.12.4", 30 | "pixi.js": "^5.0.3", 31 | "react": "^16.12.0", 32 | "react-bootstrap": "0.32.1", 33 | "react-dom": "^16.12.0" 34 | }, 35 | "devDependencies": { 36 | "@vitejs/plugin-react": "^4.0.0", 37 | "deep-diff": "^1.0.2", 38 | "documentation": "^12.1.4", 39 | "eslint": "^8.1.0", 40 | "eslint-config-prettier": "^8.8.0", 41 | "eslint-plugin-jest": "^25.2.2", 42 | "eslint-plugin-prettier": "^4.2.1", 43 | "eslint-plugin-react": "^7.32.2", 44 | "gh-pages": "^3.2.3", 45 | "gosling.js": "^0.9.31", 46 | "higlass": "^1.12.4", 47 | "levenary": "^1.1.1", 48 | "pixi.js": "^5.0.3", 49 | "prettier": "^2.8.8", 50 | "sass": "^1.25.0", 51 | "vite": "^4.3.9", 52 | "vitest": "^0.32.0" 53 | }, 54 | "scripts": { 55 | "start": "vite --port 8000", 56 | "build": "vite build", 57 | "preview": "vite preview --port 8000", 58 | "test": "vitest", 59 | "format": "eslint src --fix && prettier --write *", 60 | "docs": "documentation build src/** -f html -o build-docs --config .documentation.yml", 61 | "predeploy": "yarn build; echo \"cisvis.gehlenborglab.org\" >> build-demo/CNAME", 62 | "deploy": "gh-pages -d build-demo" 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .snakemake/ 3 | __pycache__/ 4 | *.csv 5 | *.arrow 6 | *.zarr 7 | *.txt 8 | *.mv5 9 | *.bed 10 | *.bw 11 | *.json 12 | *.out 13 | *.err 14 | data/ 15 | -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/README.md: -------------------------------------------------------------------------------- 1 | 2 | # pipelines/cistrome-to-multivec 3 | 4 | Generate multivec files from CistromeDB bigWig files. 
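At its core, the pipeline reduces each bigWig to fixed-width bins per chromosome at every resolution and stacks the per-sample vectors into one matrix. A minimal sketch of the binning step, mirroring the `bbi.fetch` call in `src/manifest_to_mv5.py` (the function name here is illustrative):

```python
import math
import bbi  # pybbi

def binned_values(bw_file, chr_name, chr_len, resolution):
    # One bin per `resolution` bp; the final bin may cover a partial window.
    n_bins = math.ceil(chr_len / resolution)
    # Aggregate the bigWig signal into n_bins values; summary="sum" matches the pipeline scripts.
    return bbi.fetch(bw_file, chr_name, 0, chr_len, n_bins, summary="sum")
```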
5 | 6 | ```sh 7 | conda activate cistrome-to-multivec-pipeline 8 | # to generate multivec outputs 9 | snakemake --cores 2 --config filetype=mv5 10 | # or, to generate zarr outputs 11 | snakemake --cores 2 --config filetype=zarr 12 | # or, if on O2 (replace with your O2 username) 13 | ./submit.sh mv5 my_username 14 | ``` 15 | 16 | ## Setup 17 | 18 | ### Create conda environment 19 | 20 | ```sh 21 | conda env create -f environment.yml 22 | conda activate cistrome-to-multivec-pipeline 23 | ``` 24 | 25 | ### Copy snakemake cluster config 26 | 27 | ```sh 28 | mkdir -p ~/.config/snakemake/cistrome-explorer 29 | cp ./cluster-profile.yml ~/.config/snakemake/cistrome-explorer/config.yaml 30 | ``` 31 | 32 | ### Sync output files with s3 bucket 33 | 34 | ```sh 35 | # replace with your credentials 36 | export AWS_ACCESS_KEY_ID="{my_access_key_id}" 37 | export AWS_SECRET_ACCESS_KEY="{my_secret_access_key}" 38 | export AWS_DEFAULT_REGION="us-east-1" 39 | 40 | # replace with your O2 username details 41 | # .../users/{first_letter_of_username}/{username}/cistrome-explorer/... 42 | aws s3 sync /n/scratch3/users/m/mk596/cistrome-explorer/data/processed/ s3://higlass-server/CistromeDB/ 43 | ``` 44 | 45 | 46 | ### Using parallel hdf5 via h5py and mpi4py 47 | 48 | *The following info is outdated, since h5py does not yet work with the parallel version of hdf5 installed on the o2 cluster. In the meantime we can do parallelization by submitting many simultaneous snakemake jobs for each output bigwig file.* 49 | 50 | https://docs.h5py.org/en/latest/build.html#building-against-parallel-hdf5 51 | 52 | #### On O2 53 | 54 | ```sh 55 | module load gcc/6.2.0 56 | module load openmpi/3.1.0 57 | module load hdf5/1.12.0 58 | 59 | which mpicc 60 | which h5cc # doesn't work for some reason - but the hdf5 dir is /n/app/hdf5/1.12.0.parallel 61 | 62 | # CC="mpicc" HDF5_MPI="ON" HDF5_DIR=/n/app/hdf5/1.12.0.parallel pip install --no-binary=h5py h5py # doesn't work since pip h5py not compatible with 1.12.0 63 | 64 | cd path/to/h5py-parent 65 | # Clone h5py so that the latest code with support for hdf5 v1.12.0 (since not yet on pip). 66 | git clone git@github.com:h5py/h5py.git 67 | 68 | cd h5py 69 | python setup.py configure --hdf5=/n/app/hdf5/1.12.0.parallel 70 | python setup.py configure --mpi 71 | python setup.py install 72 | ``` 73 | 74 | #### On macOS 75 | 76 | Download hdf5 1.10.6 source code from https://www.hdfgroup.org/downloads/hdf5/source-code/ and un-tar-gz 77 | 78 | ```sh 79 | brew install openmpi 80 | 81 | # Make a directory in which hdf5 can be installed. 82 | mkdir -p ~/software/hdf5 83 | 84 | brew info openmpi # Use this to find the CC value for the next line. 85 | 86 | # In the downloaded hdf5-1.10.6 source directory: 87 | CC=/usr/local/Cellar/open-mpi/4.0.3/bin/mpicc ./configure --enable-parallel --enable-shared --prefix=$HOME/software/hdf5 88 | make 89 | export NPROCS=3 # https://github.com/open-mpi/ompi/issues/6497 90 | make check 91 | make install 92 | 93 | cd path/to/h5py-parent 94 | # Clone h5py so that the latest code with support for hdf5 v1.12.0 (since not yet on pip). 95 | git clone git@github.com:h5py/h5py.git 96 | 97 | cd h5py 98 | export CC=/usr/local/Cellar/open-mpi/4.0.3/bin/mpicc 99 | python setup.py configure --hdf5=$HOME/software/hdf5 100 | python setup.py configure --mpi 101 | python setup.py install 102 | cd .. 
103 | python 104 | >>> import h5py 105 | >>> h5py.get_config().mpi # Should return True if MPI has been enabled 106 | ``` 107 | -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/Snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | import json 4 | import requests 5 | import platform 6 | 7 | configfile: 'config.yml' 8 | 9 | O2_USER = config.get("user", None) 10 | 11 | # Check if this is running on O2 12 | IS_O2 = (platform.system() == "Linux" and O2_USER is not None) 13 | 14 | print("This is", (" " if IS_O2 else " not "), "running on O2", sep='') 15 | 16 | # Directory / file constants 17 | SRC_DIR = "src" 18 | DATA_DIR = ("data" if not IS_O2 else join(os.sep, "n", "scratch3", "users", O2_USER[0], O2_USER, "cistrome-explorer", "data")) 19 | RAW_DIR = join(DATA_DIR, "raw") 20 | INTERMEDIATE_DIR = join(DATA_DIR, "intermediate") 21 | PROCESSED_DIR = join(DATA_DIR, "processed") 22 | 23 | # URL constants 24 | SAMPLE_DATA_URL = "http://dc2.cistrome.org/api/datahub/{cid}" 25 | SAMPLE_METADATA_URL = "http://dc2.cistrome.org/api/inspector?id={cid}" 26 | 27 | # Process the config 28 | #GROUP_NAMES = config["groups"].keys() # TODO: uncomment to create processed files for all available species & factors 29 | #GROUP_NAMES = ['Homo_sapiens__AEBP2__all', 'Homo_sapiens__AFF1__all', 'Homo_sapiens__AFF4__all'] # TODO: remove 30 | GROUP_NAMES = [ name for name, group in config["groups"].items() if len(group) < 100 and name[0] == "H" ][:200] # TODO: remove 31 | 32 | FILETYPE = config.get("filetype", "mv5") 33 | 34 | # Zarr outputs are really directories, so they need to be wrapped in snakemake's directory() 35 | # function. 36 | # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#directories-as-outputs 37 | OUTFILE_WRAPPER = directory if FILETYPE == "zarr" else (lambda x: x) 38 | 39 | # Rules 40 | rule all: 41 | input: 42 | expand(join(PROCESSED_DIR, f"{{group_name}}.multires.{FILETYPE}"), group_name=GROUP_NAMES), 43 | 44 | # Given a manifest file (containing an array of metadata.json file paths and corresponding bigWig file paths), 45 | # create a single output file (either multivec or zarr format). 46 | # Here, we can use the snakemake "groups" feature to reduce the number of cluster job submissions. 47 | # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html?highlight=group#defining-groups-for-execution 48 | rule manifest_to_outfile: 49 | group: "outfile_group" 50 | input: 51 | bigwigs=lambda w: expand(join(RAW_DIR, "{cid}.bw"), cid=config["groups"][w.group_name]), 52 | metadata=lambda w: expand(join(RAW_DIR, "{cid}.metadata.json"), cid=config["groups"][w.group_name]), 53 | manifest=join(INTERMEDIATE_DIR, "{group_name}.manifest.json") 54 | output: 55 | OUTFILE_WRAPPER(join(PROCESSED_DIR, f"{{group_name}}.multires.{FILETYPE}")) 56 | params: 57 | starting_resolution=200, 58 | script=join(SRC_DIR, f"manifest_to_{FILETYPE}.py") 59 | shell: 60 | """ 61 | python {params.script} \ 62 | -i {input.manifest} \ 63 | -o {output} \ 64 | -s {params.starting_resolution} \ 65 | -n {wildcards.group_name} 66 | """ 67 | 68 | # Create a "manifest" file for each output group. 69 | # This just simplifies the command line arguments for the 70 | # manifest_to_mv5.py and manifest_to_zarr.py scripts.
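# For illustration, a manifest produced by src/bigwigs_to_manifest.py has the form
# (paths assume the default non-O2 DATA_DIR; the sample ID is an arbitrary example):
# {
#     "input_bigwig_files": ["data/raw/62349.bw", ...],
#     "input_metadata_files": ["data/raw/62349.metadata.json", ...]
# }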
71 | rule bigwigs_to_manifest: 72 | group: "outfile_group" 73 | input: 74 | bigwigs=lambda w: expand(join(RAW_DIR, "{cid}.bw"), cid=config["groups"][w.group_name]), 75 | metadata=lambda w: expand(join(RAW_DIR, "{cid}.metadata.json"), cid=config["groups"][w.group_name]), 76 | output: 77 | join(INTERMEDIATE_DIR, "{group_name}.manifest.json") 78 | script: 79 | join(SRC_DIR, "bigwigs_to_manifest.py") 80 | 81 | # The Cistrome DB API outputs a JSON array, 82 | # where the first array element contains a URL to 83 | # the bigWig file associated with the sample ID. 84 | # We can tell snakemake that this bigwig file is "temporary", 85 | # so that snakemake will delete it after all rules that use it as an input are completed. 86 | # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#protected-and-temporary-files 87 | rule download_bigwig: 88 | group: "outfile_group" 89 | input: 90 | join(RAW_DIR, "{cid}.data.json") 91 | output: 92 | # 93 | temp(join(RAW_DIR, "{cid}.bw")) 94 | shell: 95 | ''' 96 | curl -s -S -L -o {output} $(cat {input} | jq '.[0].url' -r) 97 | ''' 98 | 99 | # The Cistrome DB has two separate endpoints for each sample: 100 | # - a "data" endpoint which returns JSON containing the path to the bigwig file for the sample 101 | # - a "metadata" endpoint which returns JSON metadata for the sample 102 | rule download_bigwig_metadata: 103 | group: "outfile_group" 104 | output: 105 | metadata_json=join(RAW_DIR, "{cid}.metadata.json"), 106 | data_json=join(RAW_DIR, "{cid}.data.json") 107 | params: 108 | sample_metadata_url=lambda w: SAMPLE_METADATA_URL.format(cid=w.cid), 109 | sample_data_url=lambda w: SAMPLE_DATA_URL.format(cid=w.cid) 110 | shell: 111 | ''' 112 | curl -s -S -L -o {output.data_json} {params.sample_data_url} && \ 113 | curl -s -S -L -o {output.metadata_json} {params.sample_metadata_url} 114 | ''' 115 | 116 | -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/cluster-profile.yml: -------------------------------------------------------------------------------- 1 | cluster: 'sbatch -p medium -n 1 -t 0-16:00:00 --mem=8000' 2 | jobs: 10 3 | latency-wait: 60 -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/environment.yml: -------------------------------------------------------------------------------- 1 | name: cistrome-to-multivec-pipeline 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - scottx611x 6 | dependencies: 7 | - python>=3.5 8 | - numpy 9 | - pandas 10 | - requests 11 | - h5py 12 | - mpi4py 13 | - dask 14 | - flake8 15 | - autopep8 16 | - pysam 17 | - snakemake 18 | - scipy 19 | - jupyter 20 | - nb_conda 21 | - jq 22 | - pyyaml 23 | - tqdm 24 | - zarr 25 | - pip 26 | - pip: 27 | - negspy 28 | - slugid 29 | - sortedcontainers 30 | - nose 31 | - cooler>=0.8.5 32 | - click>=7 33 | - pytest 34 | - pytest-cov 35 | - bumpversion 36 | - pybbi>=0.2.3 37 | - clodius 38 | -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/generate_config.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import yaml 3 | import urllib 4 | import itertools 5 | from tqdm import tqdm 6 | from pprint import pprint 7 | 8 | initial_params = { 9 | 'allqc': False, 10 | 'cellinfos': 'all', 11 | "completed": False, 12 | "curated": False, 13 | "factors": "all", 14 | "keyword": "", 15 | "page": 1, 16 | "peakqc": False, 17 | "run": False, 18 | "species": 
"all" 19 | } 20 | 21 | CISTROME_DB_BASE_URL = "http://dc2.cistrome.org/api/main_filter_ng?" 22 | 23 | DENYLIST = { 24 | 6413, 25 | 51200, 26 | 94002, 27 | 43368, 28 | } 29 | 30 | def get_cids(specie, factor, bio_source_type, bio_source_id): 31 | params = initial_params.copy() 32 | params['species'] = specie 33 | params['factors'] = factor 34 | 35 | url = CISTROME_DB_BASE_URL + urllib.parse.urlencode(params) 36 | 37 | cids = [] 38 | 39 | r = requests.get(url) 40 | if r.ok: 41 | response_json = r.json() 42 | cids += [ d["id"] for d in response_json["datasets"] ] 43 | num_pages = response_json['num_pages'] 44 | 45 | for page in range(2, num_pages + 1): 46 | page_params = params.copy() 47 | page_params["page"] = page 48 | page_url = CISTROME_DB_BASE_URL + urllib.parse.urlencode(page_params) 49 | page_r = requests.get(page_url) 50 | if page_r.ok: 51 | page_response_json = page_r.json() 52 | cids += [ d["id"] for d in page_response_json["datasets"] ] 53 | # Use a set to eliminate duplicates 54 | return list(set(cids) - DENYLIST) 55 | 56 | def get_factors_by_species(specie): 57 | params = initial_params.copy() 58 | params['species'] = specie 59 | 60 | url = CISTROME_DB_BASE_URL + urllib.parse.urlencode(params) 61 | 62 | r = requests.get(url) 63 | if r.ok: 64 | response_json = r.json() 65 | return response_json["factors"] 66 | else: 67 | print(f"Error: get_factors_by_species({specie})") 68 | return [] 69 | 70 | def get_bio_sources_by_species_and_factor(specie, factor): 71 | params = initial_params.copy() 72 | params['species'] = specie 73 | params['factors'] = factor 74 | 75 | url = CISTROME_DB_BASE_URL + urllib.parse.urlencode(params) 76 | 77 | r = requests.get(url) 78 | if r.ok: 79 | response_json = r.json() 80 | return response_json["cellinfos"] 81 | else: 82 | print(f"Error: get_bio_sources_by_species_and_factor({specie}, {factor})") 83 | return [] 84 | 85 | def generate_config(): 86 | 87 | config = { 88 | "groups": {} 89 | } 90 | 91 | initial_url = CISTROME_DB_BASE_URL + urllib.parse.urlencode(initial_params) 92 | 93 | r = requests.get(initial_url) 94 | if r.ok: 95 | index_json = r.json() 96 | 97 | all_species = index_json['species'] 98 | 99 | for specie in all_species: 100 | print(specie) 101 | specie_factors = get_factors_by_species(specie) 102 | for factor in tqdm(specie_factors): 103 | name = f"{specie}__{factor}__all".replace(" ", "_") 104 | cids = get_cids(specie, factor, None, None) 105 | 106 | config["groups"][name] = cids 107 | return config 108 | 109 | 110 | if __name__ == "__main__": 111 | 112 | config = generate_config() 113 | 114 | with open('config.yml', 'w') as f: 115 | yaml.dump(config, f, default_flow_style=False) -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/higlass_ingest.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | 5 | HIGLASS_SERVER_PATH=$1 6 | 7 | echo "Using higlass-server located at ${HIGLASS_SERVER_PATH}" 8 | 9 | for D in $(ls -1 ./data/processed | sort); do 10 | if [ ${D: -13} == ".multires.mv5" ]; then 11 | TILESET_UID=${D::${#D}-13} 12 | echo "Ingesting $TILESET_UID" 13 | if [ ${D:0:1} == "H" ]; then 14 | # Starts with "Homo_sapiens" 15 | ASSEMBLY="hg38" 16 | else 17 | ASSEMBLY="mm10" 18 | fi 19 | #python $HIGLASS_SERVER_PATH/manage.py delete_tileset --uuid=$TILESET_UID 20 | python $HIGLASS_SERVER_PATH/manage.py ingest_tileset \ 21 | --uid $TILESET_UID \ 22 | --filename ./data/processed/$D \ 23 | 
--filetype multivec \ 24 | --coordSystem $ASSEMBLY 25 | fi 26 | done -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/inspect.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import h5py\n", 10 | "import json\n", 11 | "from os.path import join" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 14, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "f = h5py.File(join(\"data\", \"processed\", \"test.multires.mv5\"), \"r\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 15, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "" 32 | ] 33 | }, 34 | "execution_count": 15, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "f.keys()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 16, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "{'id': '62349',\n", 52 | " 'status': 'completed',\n", 53 | " 'treats__0__cell_line__name': None,\n", 54 | " 'treats__0__cell_type__name': 'Th1',\n", 55 | " 'treats__0__cell_pop__name': 'CD4+',\n", 56 | " 'treats__0__disease_state__name': None,\n", 57 | " 'treats__0__factor__name': 'H3K27ac',\n", 58 | " 'treats__0__is_correcting': False,\n", 59 | " 'treats__0__link': 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2048305',\n", 60 | " 'treats__0__name': 'TH1_K27AC_REP2',\n", 61 | " 'treats__0__paper__journal__name': 'Cell',\n", 62 | " 'treats__0__paper__lab': 'Oltz EM',\n", 63 | " 'treats__0__paper__pmid': 27156452,\n", 64 | " 'treats__0__paper__reference': 'Koues OI, et al. Distinct Gene Regulatory Pathways for Human Innate versus Adaptive Lymphoid Cells. 
Cell 2016',\n", 65 | " 'treats__0__species__name': 'Homo sapiens',\n", 66 | " 'treats__0__strain__name': None,\n", 67 | " 'treats__0__tissue_type__name': 'Tonsil',\n", 68 | " 'treats__0__unique_id': 'GSM2048305',\n", 69 | " 'qc__judge__map': True,\n", 70 | " 'qc__judge__peaks': False,\n", 71 | " 'qc__judge__fastqc': True,\n", 72 | " 'qc__judge__frip': False,\n", 73 | " 'qc__judge__pbc': True,\n", 74 | " 'qc__judge__motif': False,\n", 75 | " 'qc__judge__dhs': True,\n", 76 | " 'qc__table__treat_number': 1,\n", 77 | " 'qc__table__control_number': 0,\n", 78 | " 'qc__table__map__0': 91.5,\n", 79 | " 'qc__table__fastqc__0': 37,\n", 80 | " 'qc__table__frip__0': 0.1,\n", 81 | " 'qc__table__map_number__0': 15195930,\n", 82 | " 'qc__table__pbc__0': 93.2,\n", 83 | " 'qc__table__motif': False,\n", 84 | " 'qc__table__dhs': 76.7,\n", 85 | " 'qc__table__raw_number__0': 16614502,\n", 86 | " 'qc__table__meta_orig__intron': 0.38271604938271603,\n", 87 | " 'qc__table__meta_orig__inter': 0.5432098765432098,\n", 88 | " 'qc__table__meta_orig__exon': 0.012345679012345678,\n", 89 | " 'qc__table__meta_orig__promoter': 0.06172839506172839}" 90 | ] 91 | }, 92 | "execution_count": 16, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "json.loads(f[\"resolutions\"][\"16384000\"].attrs[\"row_infos\"][0])" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python [conda env:cistrome-explorer-notebooks] *", 112 | "language": "python", 113 | "name": "conda-env-cistrome-explorer-notebooks-py" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.7.6" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 4 130 | } 131 | -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/src/bigwigs_to_manifest.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def bigwigs_to_manifest( 4 | input_bigwig_files, 5 | input_metadata_files, 6 | output_file 7 | ): 8 | 9 | manifest_json = { 10 | "input_bigwig_files": input_bigwig_files, 11 | "input_metadata_files": input_metadata_files, 12 | } 13 | 14 | with open(output_file, "w") as f: 15 | json.dump(manifest_json, f) 16 | 17 | 18 | if __name__ == "__main__": 19 | 20 | bigwigs_to_manifest( 21 | list(snakemake.input["bigwigs"]), 22 | list(snakemake.input["metadata"]), 23 | snakemake.output[0] 24 | ) -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/src/manifest_to_mv5.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import bbi 3 | import negspy.coordinates as nc 4 | import numpy as np 5 | import math 6 | import argparse 7 | import json 8 | from tqdm import tqdm 9 | import resource 10 | 11 | from utils import metadata_json_to_row_info, get_manifest_to_outfile_parser 12 | 13 | def bigwigs_to_multivec( 14 | input_bigwig_files, 15 | input_metadata_files, 16 | output_file, 17 | starting_resolution 18 | ): 19 | 20 | f = h5py.File(output_file, 'w') 21 | 22 | num_samples = len(input_bigwig_files) 23 | 24 | # 
Zip the input to create (bw, metadata) tuples 25 | zipped_input = zip(input_bigwig_files, input_metadata_files) 26 | 27 | # Create level zero groups 28 | info_group = f.create_group("info") 29 | resolutions_group = f.create_group("resolutions") 30 | chroms_group = f.create_group("chroms") 31 | 32 | # Set info attributes 33 | info_group.attrs['tile-size'] = 256 34 | 35 | # Prepare to fill in chroms dataset 36 | chromosomes = nc.get_chromorder('hg38') 37 | chromosomes = chromosomes[:25] # TODO: should more than chr1-chrM be used? 38 | num_chromosomes = len(chromosomes) 39 | chroms_length_arr = np.array([ nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes ], dtype="i8") 40 | chroms_name_arr = np.array(chromosomes, dtype="S23") 41 | 42 | chromosomes_set = set(chromosomes) 43 | chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr)) 44 | 45 | # Fill in chroms dataset entries "length" and "name" 46 | chroms_group.create_dataset("length", data=chroms_length_arr) 47 | chroms_group.create_dataset("name", data=chroms_name_arr) 48 | 49 | 50 | # Prepare to fill in resolutions dataset 51 | resolutions = [ starting_resolution*(2**x) for x in range(16)] 52 | 53 | # Create each resolution group. 54 | for resolution in resolutions: 55 | resolution_group = resolutions_group.create_group(str(resolution)) 56 | # TODO: remove the unnecessary "values" layer 57 | resolution_values_group = resolution_group.create_group("values") 58 | 59 | # Create each chromosome dataset. 60 | for chr_name, chr_len in zip(chromosomes, chroms_length_arr): 61 | chr_shape = (math.ceil(chr_len / resolution), num_samples) 62 | resolution_values_group.create_dataset(chr_name, chr_shape, dtype="f4", fillvalue=np.nan, compression='gzip') 63 | 64 | # Fill in data for each bigwig file. 65 | for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)), desc='bigwigs'): 66 | if bbi.is_bigwig(bw_file): 67 | chromsizes = bbi.chromsizes(bw_file) 68 | matching_chromosomes = set(chromsizes.keys()).intersection(chromosomes_set) 69 | 70 | # Fill in data for each resolution of a bigwig file. 71 | for resolution in resolutions: 72 | # Fill in data for each chromosome of a resolution of a bigwig file. 73 | for chr_name in matching_chromosomes: 74 | chr_len = chrom_name_to_length[chr_name] 75 | chr_shape = (math.ceil(chr_len / resolution), num_samples) 76 | arr = bbi.fetch(bw_file, chr_name, 0, chr_len, chr_shape[0], summary="sum") 77 | resolutions_group[str(resolution)]["values"][chr_name][:,bw_index] = arr 78 | else: 79 | print(f"{bw_file} not is_bigwig") 80 | 81 | f.flush() 82 | 83 | f.close() 84 | 85 | max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 86 | print(max_mem) 87 | 88 | # Append metadata to the top resolution row_infos attribute. 
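# Each row_info is the flattened per-sample metadata dict produced by
# metadata_json_to_row_info() in utils.py. Note that this writer stores the
# JSON-encoded list as a dataset on the "info" group below, rather than as an
# attribute on a resolution group.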
89 | row_infos = [] 90 | for metadata_index, metadata_file in enumerate(input_metadata_files): 91 | with open(metadata_file) as mf: 92 | try: 93 | metadata_json = json.load(mf) 94 | except Exception as e: 95 | print(f"Error loading metadata file: {metadata_file}") 96 | print(e) 97 | metadata_json = None 98 | row_info = metadata_json_to_row_info(metadata_json) 99 | row_infos.append(row_info) 100 | 101 | row_infos_encoded = str(json.dumps(row_infos)) 102 | 103 | f = h5py.File(output_file, 'r+') 104 | 105 | info_group = f["info"] 106 | info_group["row_infos"] = row_infos_encoded 107 | 108 | f.close() 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = get_manifest_to_outfile_parser() 113 | args = parser.parse_args() 114 | 115 | with open(args.input) as f: 116 | manifest_json = json.load(f) 117 | input_bigwig_files = manifest_json['input_bigwig_files'] 118 | input_metadata_files = manifest_json['input_metadata_files'] 119 | 120 | bigwigs_to_multivec( 121 | input_bigwig_files, 122 | input_metadata_files, 123 | args.output, 124 | args.starting_resolution 125 | ) -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/src/manifest_to_zarr.py: -------------------------------------------------------------------------------- 1 | import zarr 2 | from numcodecs import Zlib 3 | import bbi 4 | import negspy.coordinates as nc 5 | import numpy as np 6 | import math 7 | import argparse 8 | import json 9 | from tqdm import tqdm 10 | import resource 11 | 12 | from utils import metadata_json_to_row_info, name_to_coordsystem, get_manifest_to_outfile_parser 13 | 14 | def bigwigs_to_zarr( 15 | input_bigwig_files, 16 | input_metadata_files, 17 | output_file, 18 | starting_resolution, 19 | name 20 | ): 21 | 22 | # Short-hand for creating a DirectoryStore with a root group. 23 | f = zarr.open(output_file, mode='w') 24 | compressor = Zlib(level=1) 25 | 26 | num_samples = len(input_bigwig_files) 27 | 28 | # Zip the input to create (bw, metadata) tuples 29 | zipped_input = zip(input_bigwig_files, input_metadata_files) 30 | 31 | # Create level zero groups 32 | chromosomes_group = f.create_group("chromosomes") 33 | 34 | # Prepare to fill in chroms dataset 35 | chromosomes = nc.get_chromorder('hg38') 36 | chromosomes = [ str(chr_name) for chr_name in chromosomes[:25] ] # TODO: should more than chr1-chrM be used? 37 | num_chromosomes = len(chromosomes) 38 | chroms_length_arr = np.array([ nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes ], dtype="i8") 39 | chroms_cumsum_arr = np.concatenate((np.array([0]), np.cumsum(chroms_length_arr))) 40 | 41 | chromosomes_set = set(chromosomes) 42 | chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr)) 43 | chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr)) 44 | 45 | 46 | # Prepare to fill in resolutions dataset 47 | resolutions = [ starting_resolution*(2**x) for x in range(16)] 48 | 49 | # Create each chromosome dataset. 50 | for chr_name, chr_len in chrom_name_to_length.items(): 51 | chr_group = chromosomes_group.create_group(chr_name) 52 | # Create each resolution group. 53 | for resolution in resolutions: 54 | chr_shape = (num_samples, math.ceil(chr_len / resolution)) 55 | chr_group.create_dataset(str(resolution), shape=chr_shape, dtype="f4", fill_value=np.nan, compressor=compressor) 56 | 57 | # Fill in data for each bigwig file. 
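# Note that the zarr layout is transposed relative to the mv5 writer above:
# each per-chromosome array here is (num_samples, num_bins), so one bigWig
# fills a row ([bw_index, :]) rather than a column.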
58 | for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)), desc='bigwigs'): 59 | if bbi.is_bigwig(bw_file): 60 | chromsizes = bbi.chromsizes(bw_file) 61 | matching_chromosomes = set(chromsizes.keys()).intersection(chromosomes_set) 62 | 63 | # Fill in data for each resolution of a bigwig file. 64 | for resolution in resolutions: 65 | # Fill in data for each chromosome of a resolution of a bigwig file. 66 | for chr_name in matching_chromosomes: 67 | chr_len = chrom_name_to_length[chr_name] 68 | chr_shape = (num_samples, math.ceil(chr_len / resolution)) 69 | arr = bbi.fetch(bw_file, chr_name, 0, chr_len, chr_shape[1], summary="sum") 70 | chromosomes_group[chr_name][str(resolution)][bw_index,:] = arr 71 | else: 72 | print(f"{bw_file} not is_bigwig") 73 | 74 | max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 75 | print(max_mem) 76 | 77 | # Append metadata to the top resolution row_infos attribute. 78 | row_infos = [] 79 | for metadata_index, metadata_file in enumerate(input_metadata_files): 80 | with open(metadata_file) as mf: 81 | metadata_json = json.load(mf) 82 | row_info = metadata_json_to_row_info(metadata_json) 83 | row_infos.append(row_info) 84 | 85 | # f.attrs should contain all tileset_info properties 86 | # For zarr, more attributes are used here to allow "serverless" 87 | f.attrs['row_infos'] = row_infos 88 | f.attrs['resolutions'] = sorted(resolutions, reverse=True) 89 | f.attrs['shape'] = [ num_samples, 256 ] 90 | f.attrs['name'] = name 91 | f.attrs['coordSystem'] = name_to_coordsystem(name) 92 | 93 | # https://github.com/zarr-developers/zarr-specs/issues/50 94 | f.attrs['multiscales'] = [ 95 | { 96 | "version": "0.1", 97 | "name": chr_name, 98 | "datasets": [ 99 | { "path": f"chromosomes/{chr_name}/{resolution}" } 100 | for resolution in sorted(resolutions, reverse=True) 101 | ], 102 | "type": "zarr-multivec", 103 | "metadata": { 104 | "chromoffset": int(chrom_name_to_cumsum[chr_name]), 105 | "chromsize": int(chr_len), 106 | } 107 | } 108 | for (chr_name, chr_len) in list(zip(chromosomes, chroms_length_arr)) 109 | ] 110 | 111 | 112 | if __name__ == "__main__": 113 | parser = get_manifest_to_outfile_parser() 114 | args = parser.parse_args() 115 | 116 | with open(args.input) as f: 117 | manifest_json = json.load(f) 118 | input_bigwig_files = manifest_json['input_bigwig_files'] 119 | input_metadata_files = manifest_json['input_metadata_files'] 120 | 121 | bigwigs_to_zarr( 122 | input_bigwig_files, 123 | input_metadata_files, 124 | args.output, 125 | args.starting_resolution, 126 | args.name 127 | ) -------------------------------------------------------------------------------- /pipelines/cistrome-to-multivec/src/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def p2f(x): 4 | if type(x) == str: 5 | if x.endswith('%'): 6 | return float(x.strip('%')) 7 | return float(x) 8 | return x 9 | 10 | def name_to_coordsystem(name): 11 | if name.startswith("Homo_sapiens"): 12 | return "hg38" 13 | if name.startswith("Mus_musculus"): 14 | return "mm10" 15 | return "unknown" 16 | 17 | def metadata_json_to_row_info(metadata_json): 18 | 19 | def dict_get(keys, expected_type): 20 | curr_d = metadata_json 21 | try: 22 | for k in keys: 23 | curr_d = curr_d[k] 24 | 25 | if curr_d == None: 26 | return None 27 | elif type(curr_d) != expected_type: 28 | try: 29 | curr_d = expected_type(curr_d) 30 | return curr_d 31 | except ValueError: 32 | return None 33 | return curr_d 34 | except KeyError: 35 | return None 36 | 
--------------------------------------------------------------------------------
/pipelines/cistrome-to-multivec/src/utils.py:
--------------------------------------------------------------------------------
import argparse

def p2f(x):
    # Convert percentage strings such as "98.5%" to floats.
    if isinstance(x, str):
        if x.endswith('%'):
            return float(x.strip('%'))
        return float(x)
    return x

def name_to_coordsystem(name):
    if name.startswith("Homo_sapiens"):
        return "hg38"
    if name.startswith("Mus_musculus"):
        return "mm10"
    return "unknown"

def metadata_json_to_row_info(metadata_json):

    def dict_get(keys, expected_type):
        # Walk the nested dicts/lists along `keys`; return None when a step is
        # missing, and coerce the final value to `expected_type` if possible.
        curr_d = metadata_json
        try:
            for k in keys:
                curr_d = curr_d[k]

            if curr_d is None:
                return None
            elif not isinstance(curr_d, expected_type):
                try:
                    return expected_type(curr_d)
                except ValueError:
                    return None
            return curr_d
        except (KeyError, IndexError, TypeError):
            return None

    # Flatten the metadata object.
    row_info = {
        "id": dict_get(["id"], str),
        "status": dict_get(["status"], str),
        "treats__0__cell_line__name": dict_get(["treats", 0, "cell_line__name"], str),
        "treats__0__cell_type__name": dict_get(["treats", 0, "cell_type__name"], str),
        "treats__0__cell_pop__name": dict_get(["treats", 0, "cell_pop__name"], str),
        "treats__0__disease_state__name": dict_get(["treats", 0, "disease_state__name"], str),
        "treats__0__factor__name": dict_get(["treats", 0, "factor__name"], str),
        "treats__0__is_correcting": dict_get(["treats", 0, "is_correcting"], bool),
        "treats__0__link": dict_get(["treats", 0, "link"], str),
        "treats__0__name": dict_get(["treats", 0, "name"], str),
        "treats__0__paper__journal__name": dict_get(["treats", 0, "paper__journal__name"], str),
        "treats__0__paper__lab": dict_get(["treats", 0, "paper__lab"], str),
        "treats__0__paper__pmid": dict_get(["treats", 0, "paper__pmid"], str),
        "treats__0__paper__reference": dict_get(["treats", 0, "paper__reference"], str),
        "treats__0__species__name": dict_get(["treats", 0, "species__name"], str),
        "treats__0__strain__name": dict_get(["treats", 0, "strain__name"], str),
        "treats__0__tissue_type__name": dict_get(["treats", 0, "tissue_type__name"], str),
        "treats__0__unique_id": dict_get(["treats", 0, "unique_id"], str),

        "qc__judge__map": dict_get(["qc", "judge", "map"], bool),
        "qc__judge__peaks": dict_get(["qc", "judge", "peaks"], bool),
        "qc__judge__fastqc": dict_get(["qc", "judge", "fastqc"], bool),
        "qc__judge__frip": dict_get(["qc", "judge", "frip"], bool),
        "qc__judge__pbc": dict_get(["qc", "judge", "pbc"], bool),
        "qc__judge__motif": dict_get(["qc", "judge", "motif_judge"], bool),
        "qc__judge__dhs": dict_get(["qc", "judge", "dhs"], bool),

        "qc__table__treat_number": dict_get(["qc", "table", "treat_number"], int),
        "qc__table__control_number": dict_get(["qc", "table", "control_number"], int),
        "qc__table__map__0": p2f(dict_get(["qc", "table", "map", 0], str)),
        "qc__table__fastqc__0": dict_get(["qc", "table", "fastqc", 0], int),
        "qc__table__frip__0": p2f(dict_get(["qc", "table", "frip", 0], str)),
        "qc__table__map_number__0": dict_get(["qc", "table", "map_number", 0], int),
        "qc__table__pbc__0": p2f(dict_get(["qc", "table", "pbc", 0], str)),
        "qc__table__motif": dict_get(["qc", "table", "motif"], bool),
        "qc__table__dhs": p2f(dict_get(["qc", "table", "dhs"], str)),
        "qc__table__raw_number__0": dict_get(["qc", "table", "raw_number", 0], int),

        "qc__table__meta_orig__intron": dict_get(["qc", "table", "meta_orig", "intron"], float),
        "qc__table__meta_orig__inter": dict_get(["qc", "table", "meta_orig", "inter"], float),
        "qc__table__meta_orig__exon": dict_get(["qc", "table", "meta_orig", "exon"], float),
        "qc__table__meta_orig__promoter": dict_get(["qc", "table", "meta_orig", "promoter"], float),
    }

    return row_info

def get_manifest_to_outfile_parser():
    parser = argparse.ArgumentParser(description='Create an output file by combining multiple bigwig files.')
    parser.add_argument('-i', '--input', type=str, required=True, help='The input manifest JSON file.')
    parser.add_argument('-o', '--output', type=str, required=True, help='The output file.')
    parser.add_argument('-s', '--starting-resolution', type=int, default=200, help='The starting resolution.')
    parser.add_argument('-n', '--name', type=str, required=True, help='A name to include in tileset_info.')
    return parser
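For reference, here is how `metadata_json_to_row_info` flattens a record, traced against the code above with a hypothetical, heavily truncated metadata object (run from the `src/` directory). Key names are the `__`-joined paths, missing keys fall back to `None`, and `p2f` turns percentage strings into floats:

```python
from utils import metadata_json_to_row_info

# Hypothetical, heavily truncated Cistrome metadata record.
example = {
    "id": 1234,
    "status": "complete",
    "treats": [{"cell_line__name": "K562", "factor__name": "CTCF"}],
    "qc": {"judge": {"map": True}, "table": {"map": ["98.5%"]}},
}

row_info = metadata_json_to_row_info(example)
print(row_info["id"])                       # '1234' (coerced to the expected type)
print(row_info["treats__0__factor__name"])  # 'CTCF'
print(row_info["qc__table__map__0"])        # 98.5 (p2f strips the '%')
print(row_info["treats__0__link"])          # None (missing keys become None)
```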
--------------------------------------------------------------------------------
/pipelines/cistrome-to-multivec/submit.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -c 1                  # number of cores
#SBATCH -N 1                  # number of nodes
#SBATCH -t 1-00:00            # runtime in D-HH:MM format
#SBATCH -p medium             # partition
#SBATCH --mem=2000            # memory in MB (for all cores)
#SBATCH -o snakemake-%j.out   # file for STDOUT with job ID
#SBATCH -e snakemake-%j.err   # file for STDERR with job ID
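# Usage: sbatch submit.sh <filetype> <user>
# The two positional arguments are forwarded to Snakemake below as
# `--config filetype=$1 user=$2`.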

source ~/.bashrc_mark # to load the `conda activate` command

conda activate cistrome-to-multivec-pipeline
snakemake --profile cistrome-explorer --keep-going --config filetype=$1 user=$2
--------------------------------------------------------------------------------
/pipelines/mira-data/.gitignore:
--------------------------------------------------------------------------------
/data/*
/mira-datasets/*
.ipynb_checkpoints
/output/*
__pycache__
--------------------------------------------------------------------------------
/pipelines/mira-data/README.md:
--------------------------------------------------------------------------------
### Create conda environment

```sh
conda env create -f environment.yml
```

### Activate conda environment

```sh
conda activate mira-envi
```

### Aggregate Multivec

```sh
clodius aggregate multivec \
    --chromsizes-filename mm10.chrom.sizes \
    --starting-resolution 1000 \
    ./output/e18_mouse_brain_10x_dataset_500_random_rows.hdf5
```

### Copy File to Server

```sh
rsync -avz --progress filename server:filename
```

### Add to HiGlass Server

```sh
sudo docker exec higlass-container python higlass-server/manage.py ingest_tileset \
    --filename /tmp/e18_mouse_brain_10x_dataset_500_smoothed_random_rows.multires.mv5 \
    --filetype multivec \
    --datatype multivec \
    --uid e18_mouse_brain_10x_dataset_500_smoothed_random_rows
```
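Before copying the aggregated file to the server, it can be inspected locally with `h5py` (already in the environment). A minimal sketch, assuming the aggregate step above produced a `.multires.mv5` file next to the input (adjust the path to the actual output):

```python
import h5py

# Print every group and dataset in the aggregated multivec file.
with h5py.File('./output/e18_mouse_brain_10x_dataset_500_random_rows.multires.mv5', 'r') as f:
    f.visit(print)
```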
--------------------------------------------------------------------------------
/pipelines/mira-data/cliff_code.py:
--------------------------------------------------------------------------------
import numpy as np
import warnings
from scipy import sparse

def _residual_transform(X, pi_j_hat, n_i):

    assert(isinstance(X, np.ndarray))
    assert(isinstance(pi_j_hat, np.ndarray))
    assert(isinstance(n_i, np.ndarray))
    pi_j_hat = np.squeeze(pi_j_hat)[np.newaxis, :]
    n_i = np.squeeze(n_i)[:, np.newaxis]

    mu_ij_hat = n_i * pi_j_hat

    count_dif = n_i - X
    expected_count_dif = n_i - mu_ij_hat

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        r_ij = np.multiply(
            np.sign(X - mu_ij_hat),
            np.sqrt(
                np.where(X > 0, 2 * np.multiply(X, np.log(X / mu_ij_hat)), 0) + \
                2 * np.multiply(count_dif, np.log(count_dif / expected_count_dif))
            )
        )

    return np.clip(np.nan_to_num(r_ij), -10, 10)

def _get_pi(X):
    return np.array(X.sum(0)).reshape(-1)/X.sum()

def _get_n(X):
    return np.array(X.sum(-1)).reshape(-1)


def deviance_transform(X, subset_mask = None):
    '''
    Parameters
    ----------
    X : scipy sparse matrix (N cells x N peaks)
        Sparse matrix of ATAC-seq counts. The values may be binary or positive integers.
    subset_mask : np.ndarray (N peaks) or None
        Calculating a dense version of the whole matrix is memory intensive, but the parameters of the transformation can
        be estimated from the whole matrix, then applied to just a subset. If "subset_mask" is *None*, the transformation
        will be applied to the whole matrix. If "subset_mask" is a boolean mask of shape (N peaks), then the transformed
        accessibility of only those peaks will be returned.

    Returns
    -------
    residuals : np.ndarray (N cells x N subset peaks)

    Example
    -------

    If you are working with an AnnData object called "atac_data",
    to get the transformed values for only the first chromosome:

    >>> import cliff_code
    >>> import numpy as np
    >>> residuals = cliff_code.deviance_transform(atac_data.X, subset_mask = atac_data.var['chrom'].values == 'chr1')

    Then, for very simple KNN smoothing you can use the "connectivities" matrix in the AnnData object, which is calculated
    during the nearest neighbors step of the analysis:

    >>> smoothed = atac_data.obsp['connectivities'].dot(residuals)

    '''

    assert sparse.isspmatrix(X)
    X = X.tocsr()

    if subset_mask is None:
        subset_mask = np.ones(X.shape[-1]).astype(bool)

    n_i = _get_n(X)
    p_i = _get_pi(X)
    X = X[:, subset_mask].toarray()

    residuals = _residual_transform(X, p_i[subset_mask], n_i)

    return residuals
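As a self-contained toy check of `deviance_transform` (all numbers here are made up), note that the transform parameters are estimated from the full matrix even when a subset mask is supplied:

```python
import numpy as np
from scipy import sparse
import cliff_code

# 3 cells x 4 peaks of made-up binary accessibility counts.
X = sparse.csr_matrix(np.array([
    [1, 0, 1, 0],
    [0, 1, 1, 1],
    [1, 1, 0, 1],
]))

residuals = cliff_code.deviance_transform(X)
print(residuals.shape)  # (3, 4): one deviance residual per cell/peak pair

# Restrict the returned residuals to the first two peaks only:
subset = np.array([True, True, False, False])
print(cliff_code.deviance_transform(X, subset_mask=subset).shape)  # (3, 2)
```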
--------------------------------------------------------------------------------
/pipelines/mira-data/environment.yml:
--------------------------------------------------------------------------------
name: mira-envi
channels:
  - bioconda
  - conda-forge
  - defaults
dependencies:
  - pip=21.2.4
  - python=3.7
  - snakemake-minimal=7.5.0
  - ipykernel
  - pip
  - pip:
    - clodius==0.18.1
    - h5py==3.6.0
    - rich==12.3.0
    - mira-multiome>=1.0.1
    - scanpy==1.8.2
    - numpy==1.20.1
    - scikit-learn
--------------------------------------------------------------------------------
/pipelines/mira-data/max_topic.js:
--------------------------------------------------------------------------------
// For each row object in `a`, find the topic column in `c` with the largest
// value and record that column's name in the row's `max_topic` field.
a.forEach(d => {
    let mi = 0;
    let mv = -1;
    for (let i = 0; i < c.length; i++) {
        if (mv < d[c[i]]) {
            mv = d[c[i]];
            mi = i;
        }
    }
    d['max_topic'] = c[mi];
});
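The same per-row argmax can be done in Python when the rows are loaded into a pandas DataFrame; a minimal sketch with hypothetical topic column names:

```python
import pandas as pd

# Hypothetical per-cell topic scores.
df = pd.DataFrame({
    'topic_0': [0.2, 0.7, 0.1],
    'topic_1': [0.8, 0.1, 0.3],
})
topic_columns = ['topic_0', 'topic_1']

# idxmax(axis=1) returns, per row, the name of the column holding the largest value.
df['max_topic'] = df[topic_columns].idxmax(axis=1)
print(df['max_topic'].tolist())  # ['topic_1', 'topic_0', 'topic_1']
```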
--------------------------------------------------------------------------------
/pipelines/mira-data/mm10.chrom.sizes:
--------------------------------------------------------------------------------
chr1 195471971
chr2 182113224
chr3 160039680
chr4 156508116
chr5 151834684
chr6 149736546
chr7 145441459
chr8 129401213
chr9 124595110
chr10 130694993
chr11 122082543
chr12 120129022
chr13 120421639
chr14 124902244
chr15 104043685
chr16 98207768
chr17 94987271
chr18 90702639
chr19 61431566
chrX 171031299
chrY 91744698
chrM 16299
chr1_GL456210_random 169725
chr1_GL456211_random 241735
chr1_GL456212_random 153618
chr1_GL456213_random 39340
chr1_GL456221_random 206961
chr4_GL456216_random 66673
chr4_GL456350_random 227966
chr4_JH584292_random 14945
chr4_JH584293_random 207968
chr4_JH584294_random 191905
chr4_JH584295_random 1976
chr5_GL456354_random 195993
chr5_JH584296_random 199368
chr5_JH584297_random 205776
chr5_JH584298_random 184189
chr5_JH584299_random 953012
chr7_GL456219_random 175968
chrUn_GL456239 40056
chrUn_GL456359 22974
chrUn_GL456360 31704
chrUn_GL456366 47073
chrUn_GL456367 42057
chrUn_GL456368 20208
chrUn_GL456370 26764
chrUn_GL456372 28664
chrUn_GL456378 31602
chrUn_GL456379 72385
chrUn_GL456381 25871
chrUn_GL456382 23158
chrUn_GL456383 38659
chrUn_GL456385 35240
chrUn_GL456387 24685
chrUn_GL456389 28772
chrUn_GL456390 24668
chrUn_GL456392 23629
chrUn_GL456393 55711
chrUn_GL456394 24323
chrUn_GL456396 21240
chrUn_JH584304 114452
chrX_GL456233_random 336933
chrY_JH584300_random 182347
chrY_JH584301_random 259875
chrY_JH584302_random 155838
chrY_JH584303_random 158099
--------------------------------------------------------------------------------
/pipelines/mira-data/process.py:
--------------------------------------------------------------------------------
# %%
import numpy as np
import math
import h5py
import pandas as pd
from random import sample
import scanpy as sc
import mira
import umap
import cliff_code
# %%
NUM_ROWS = 500
RESOLUTION = 1000
# %%
# Reference:
# https://colab.research.google.com/github/AllenWLynch/MIRA/blob/main/docs/source/notebooks/tutorial_mouse_brain.ipynb
def basic_preprocess():
    # Download this data by running `mira.datasets.MouseBrainDataset()`
    data = sc.read_h5ad('./mira-datasets/e18_10X_brain_dataset/e18_mouse_brain_10x_dataset.ad')
    rna_data = data[:, data.var.feature_types == 'Gene Expression']
    atac_data = data[:, data.var.feature_types == 'Peaks']

    # Basic preprocessing steps
    rna_data.var.index = rna_data.var.index.str.upper()
    rna_data.var_names_make_unique()
    rna_data = rna_data[:, ~rna_data.var.index.str.startswith('GM')]

    sc.pp.filter_cells(rna_data, min_counts = 400)
    sc.pp.filter_genes(rna_data, min_cells=15)

    rna_data.var['mt'] = rna_data.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(rna_data, qc_vars=['mt'], percent_top=None,
        log1p=False, inplace=True)

    rna_data = rna_data[rna_data.obs.pct_counts_mt < 15, :]
    rna_data = rna_data[rna_data.obs.n_genes_by_counts < 8000, :]
    sc.pp.filter_genes(rna_data, min_cells=15)

    rna_data.raw = rna_data # save raw counts
    sc.pp.normalize_total(rna_data, target_sum=1e4)
    sc.pp.log1p(rna_data)

    sc.pp.highly_variable_genes(rna_data, min_disp = -0.1)
    rna_data.layers['norm'] = rna_data.X # save normalized count data
    rna_data.X = rna_data.raw.X # and reload raw counts
    rna_data = rna_data[:, rna_data.var.highly_variable]
    rna_data.var['exog_feature'] = rna_data.var.highly_variable # set column "exog_feature" to all genes that met the dispersion threshold
    rna_data.var.highly_variable = (rna_data.var.dispersions_norm > 0.8) & rna_data.var.exog_feature # restrict "highly_variable" to genes that also have dispersion > 0.8

    overlapping_barcodes = np.intersect1d(rna_data.obs_names, atac_data.obs_names) # make sure barcodes are matched between modes
    rna_data = rna_data[[i for i in overlapping_barcodes],:]
    atac_data = atac_data[[i for i in overlapping_barcodes],:]

    # row info
    rna_model = mira.topic_model.ExpressionTopicModel.load('mira-datasets/e18_10X_brain_dataset/e18_mouse_brain_10x_rna_model.pth')
    atac_model = mira.topic_model.AccessibilityTopicModel.load('mira-datasets/e18_10X_brain_dataset/e18_mouse_brain_10x_atac_model.pth')

    rna_model.predict(rna_data)
    atac_model.predict(atac_data, batch_size=128)

    atac_model.get_umap_features(atac_data, box_cox = 0.5)
    rna_model.get_umap_features(rna_data, box_cox = 0.5)
    rna_data, atac_data = mira.utils.make_joint_representation(rna_data, atac_data)
    rna_model.impute(rna_data)

    main_barcodes = pd.read_csv("mira-datasets/e18_10X_brain_dataset/e18_mouse_brain_10x_main_barcodes.csv", index_col=0, header=0, names=["barcodes"])

    # sample data randomly
    sampled = sample(list(main_barcodes["barcodes"]), NUM_ROWS)
    atac_main = atac_data[sampled]
    rna_main = rna_data[sampled]

    # nearest neighbors
    sc.pp.neighbors(atac_main, use_rep='X_joint_umap_features', metric='manhattan')
    # sc.tl.umap(atac_main, min_dist = 0.3, negative_sample_rate=5)

    residuals = cliff_code.deviance_transform(atac_main.X)
    smoothed = atac_main.obsp['connectivities'].dot(residuals)

    # Clamp negative values to zero
    smoothed[smoothed < 0] = 0

    atac_main.X = smoothed

    to_save = atac_main.obs
    to_save *= 1000
    to_save.reset_index().to_json('output/obs.json', orient='records')

    rna_data = None
    data = None

    return atac_main.to_df().filter(regex='chr')
# %%
def anndata_to_multivec():
    # https://github.com/igvteam/igv/blob/master/genomes/sizes/mm10.chrom.sizes
    chromSizes = [
        ('chr1', 195471971),
        ('chr2', 182113224),
        ('chr3', 160039680),
        ('chr4', 156508116),
        ('chr5', 151834684),
        ('chr6', 149736546),
        ('chr7', 145441459),
        ('chr8', 129401213),
        ('chr9', 124595110),
        ('chr10', 130694993),
        ('chr11', 122082543),
        ('chr12', 120129022),
        ('chr13', 120421639),
        ('chr14', 124902244),
        ('chr15', 104043685),
        ('chr16', 98207768),
        ('chr17', 94987271),
        ('chr18', 90702639),
        ('chr19', 61431566),
        ('chrX', 171031299),
        ('chrY', 91744698),
        ('chrM', 16299)
    ]

    dff = basic_preprocess()

    # filter data for initial example
    # dff = df.filter(regex='chr').head(NUM_ROWS)

    # num_rows = len(dff.index)

    chromScaled = { c: math.ceil(s / RESOLUTION) for (c, s) in chromSizes }

    with h5py.File(f'./output/e18_mouse_brain_10x_dataset_{NUM_ROWS}_smoothed_random_rows.hdf5', "w") as f:
        prev_c = None
        for column in dff.columns:
            # e.g., "chr1:3060610-3061485"
            [c, interval] = column.split(':')
            [start, end] = interval.split('-')
            start = int(start)
            end = int(end)

            if prev_c is None:
                ss = chromScaled[c]
                density = np.zeros((ss, NUM_ROWS))
                prev_c = c

            if c != prev_c:
                print(f'Storing {prev_c}')
                # density = density.reshape(-1, math.ceil(chromScaled[prev_c] / RESOLUTION), NUM_ROWS).sum(axis=0)
                f.create_dataset(name=prev_c, data=density, dtype='f', compression='gzip', chunks=True)

                # free memory
                density = None
                f.flush()

                # start new
                ss = chromScaled[c]
                density = np.zeros((ss, NUM_ROWS))

                prev_c = c

            density[math.floor(start / RESOLUTION) : math.ceil(end / RESOLUTION)] = dff[column]

        # Store the final chromosome, which the `c != prev_c` branch above never reaches.
        if prev_c is not None:
            print(f'Storing {prev_c}')
            f.create_dataset(name=prev_c, data=density, dtype='f', compression='gzip', chunks=True)

# %%
anndata_to_multivec()
# %%
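For reference, the bin arithmetic in the loop above maps a peak's genomic interval onto row indices of the per-chromosome density matrix. A worked example at `RESOLUTION = 1000`, using the peak name quoted in the comment above:

```python
import math

RESOLUTION = 1000
column = 'chr1:3060610-3061485'  # example column name in dff.columns

chrom, interval = column.split(':')
start, end = (int(x) for x in interval.split('-'))

lo = math.floor(start / RESOLUTION)  # 3060
hi = math.ceil(end / RESOLUTION)     # 3062
print(chrom, lo, hi)  # the peak's values fill density[3060:3062] for chr1
```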
--------------------------------------------------------------------------------
/src/ContextMenu.jsx:
--------------------------------------------------------------------------------
import React, { useEffect, useState } from 'react';
import PubSub from 'pubsub-js';

import { EVENT, CONTEXT_MENU_TYPE } from './utils/constants.js';

import './ContextMenu.scss';

export function destroyContextMenu() {
    PubSub.publish(EVENT.CONTEXT_MENU, {
        x: null,
        y: null,
        menuType: null,
        items: []
    });
}
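// A menu is opened by publishing the same event with a filled-in payload
// whose fields mirror the subscriber in the component below, e.g.
// PubSub.publish(EVENT.CONTEXT_MENU, { x, y, menuType: CONTEXT_MENU_TYPE.NOMINAL_BAR, title, items });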

/**
 * Context menu component. Subscribes to 'context-menu' event via `PubSub`.
 * @example
 * <ContextMenu/>
 */
export default function ContextMenu() {
    const [left, setLeft] = useState(null);
    const [top, setTop] = useState(null);
    const [menuItemData, setMenuItemData] = useState([]);

    useEffect(() => {
        const contextMenuToken = PubSub.subscribe(EVENT.CONTEXT_MENU, (msg, data) => {
            setLeft(data.x);
            setTop(data.y);

            let menuData = [];
            switch (data.menuType) {
                case CONTEXT_MENU_TYPE.NOMINAL_BAR:
                case CONTEXT_MENU_TYPE.TREE_ANCESTOR:
                    menuData.push({ title: data.title });
                    menuData.push({ isSeparator: true });
                    menuData.push(...data.items);
                    break;
                default:
                    break;
            }
            setMenuItemData(menuData);
        });

        return () => {
            PubSub.unsubscribe(contextMenuToken);
        };
    });

    // Function to generate an item for ContextMenu
    function ContextMenuItem(props) {
        const { isSeparator, key, icon, title, action } = props;

        return isSeparator ? (
            <hr key={key} className="hm-context-menu-separator"/>
        ) : (
            <div
                key={key}
                className={action ? "hm-context-menu-item" : "hm-context-menu-item-title"}
                onClick={action}
            >
                {icon ? (
                    <svg viewBox={icon.viewBox}>
                        <path d={icon.path} fill="currentColor"/>
                    </svg>
                ) : null}
                {title}
            </div>
        );
    }

    return (
        <div
            className="hm-context-menu-container"
            style={{
                left,
                top,
                visibility: left !== null && top !== null ? 'visible' : 'hidden'
            }}
        >
            {menuItemData.map((d, i) => {
                return ContextMenuItem({ ...d, key: i });
            })}
        </div>
    );
}
--------------------------------------------------------------------------------
/src/ContextMenu.scss:
--------------------------------------------------------------------------------
.hm-context-menu-container {
    position: fixed;
    min-width: 200px;
    background-color: #fffffff2;
    border: 1px solid #0000001a;
    border-radius: 3px;
    font-size: 12px;
    cursor: default;
    padding: 3px;
    box-shadow: 0 0 3px 0 rgba(0, 0, 0, 0.1), 0 1px 5px 0 rgba(0, 0, 0, 0.05);
    z-index: 1;

    .hm-context-menu-separator {
        margin-top: 5px;
        margin-bottom: 5px;
        border: 0;
        border-top: 1px solid rgba(0, 0, 0, 0.1);
    }

    .hm-context-menu-item,
    .hm-context-menu-item-title {
        padding: 2px;
        white-space: nowrap;
        border-radius: 2px;
        transition: background 0.15s ease, color 0.15s ease;
    }
    .hm-context-menu-item-title {
        font-weight: bold;
    }
    .hm-context-menu-item:hover {
        background: #337ab7;
        color: #fff;
    }
}
--------------------------------------------------------------------------------
/src/GeneExpressionSelection.jsx:
--------------------------------------------------------------------------------
import { useEffect, useState } from "react";
import { AnnDataSource, ObsFeatureMatrixAnndataLoader } from '@vitessce/zarr';
import './GeneExpressionSelection.scss';

export default function GeneExpressionSelection(props) {
    const {
        left,
        top,
        width,
        height,
        url = 'https://s3.amazonaws.com/gosling-lang.org/data/cistrome/e18_mouse_brain_10x_rna_main.zarr/',
        genes,
        onGeneSelection = (genes) => { }
    } = props;

    const [metadata, setMetadata] = useState({});
    const [keyword, setKeyword] = useState('');
    const [selectedGenes, setSelectedGenes] = useState([...genes]);

    async function getMetadata() {
        const source = new AnnDataSource({ url });
        const config = {
            url,
            fileType: 'obsFeatureMatrix.mudata.zarr',
            options: {
                path: 'X'
            }
        };
        const loader = new ObsFeatureMatrixAnndataLoader(source, config);

        // obsIndex is cell IDs. varIndex is gene IDs.
        const {
            data: { rows: obsIndex, cols: varIndex }
        } = await loader.loadAttrs();

        setMetadata({
            obsIndex,
            varIndex,
            loader
        });
        // We can load the data for a subset of genes by selecting an array of gene IDs.
        // const { data } = await loader.loadGeneSelection({ selection: ['XKR4'] });
        // const expression = obsIndex.map((cellId, i) => ({ cellId, normalizedExpression: data[0][i] / 256 }));
        // callback(expression);
    }

    useEffect(() => {
        getMetadata();
    }, [url]);

    useEffect(() => {
        onGeneSelection(selectedGenes);
    }, [selectedGenes]);

    return (
        <div style={{ left, top, width, height }}>
            <input
                type="text"
                value={keyword}
                onChange={e => setKeyword(e.target.value)}
                placeholder="Search for genes"
            />
            <div>
                {keyword === '' ? [] : metadata.varIndex?.filter(gene => gene.toUpperCase().includes(keyword.toUpperCase())).sort().slice(0, 30).map(gene => {
                    return (
                        <div
                            key={gene}
                            onClick={() => setSelectedGenes([...selectedGenes, gene])}
                        >
                            {gene}
                        </div>
                    );
                })}
            </div>
        </div>
    );
}