├── .gitignore
├── LICENSE
├── README.md
├── contributing.md
└── scripts
├── generate-star-badges.py
└── github-markdown-toc
/.gitignore:
--------------------------------------------------------------------------------
1 | README.md.backup
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Awesome LLMOps**
2 |
3 |
4 |
5 |
6 | An awesome & curated list of the best LLMOps tools for developers.
7 |
8 | **Contribute**
9 |
10 | Contributions are most welcome, please adhere to the [contribution guidelines](contributing.md).
11 |
12 | # Table of Contents
13 |
14 | - [Table of Contents](#table-of-contents)
15 | - [Model](#model)
16 | - [Large Language Model](#large-language-model)
17 | - [CV Foundation Model](#cv-foundation-model)
18 | - [Audio Foundation Model](#audio-foundation-model)
19 | - [Serving](#serving)
20 | - [Large Model Serving](#large-model-serving)
21 | - [Frameworks/Servers for Serving](#frameworksservers-for-serving)
22 | - [Observability](#observability)
23 | - [LLMOps](#llmops)
24 | - [Search](#search)
25 | - [Vector search](#vector-search)
26 | - [Code AI](#code-ai)
27 | - [Training](#training)
28 | - [IDEs and Workspaces](#ides-and-workspaces)
29 | - [Foundation Model Fine Tuning](#foundation-model-fine-tuning)
30 | - [Frameworks for Training](#frameworks-for-training)
31 | - [Experiment Tracking](#experiment-tracking)
32 | - [Visualization](#visualization)
33 | - [Data](#data)
34 | - [Data Management](#data-management)
35 | - [Data Storage](#data-storage)
36 | - [Data Tracking](#data-tracking)
37 | - [Feature Engineering](#feature-engineering)
38 | - [Data/Feature enrichment](#datafeature-enrichment)
39 | - [Large Scale Deployment](#large-scale-deployment)
40 | - [ML Platforms](#ml-platforms)
41 | - [Workflow](#workflow)
42 | - [Scheduling](#scheduling)
43 | - [Model Management](#model-management)
44 | - [Performance](#performance)
45 | - [ML Compiler](#ml-compiler)
46 | - [Profiling](#profiling)
47 | - [AutoML](#automl)
48 | - [Optimizations](#optimizations)
49 | - [Federated ML](#federated-ml)
50 | - [Awesome Lists](#awesome-lists)
51 |
52 |
53 |
54 | # Model
55 |
56 | ## Large Language Model
57 |
58 | - [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)  - Code and documentation to train Stanford's Alpaca models, and generate the data.
59 | - [BELLE](https://github.com/LianjiaTech/BELLE)  - A 7B Large Language Model fine-tuned on a 34B Chinese Character Corpus, based on LLaMA and Alpaca.
60 | - [Bloom](https://github.com/bigscience-workshop/model_card)  - BigScience Large Open-science Open-access Multilingual Language Model
61 | - [dolly](https://github.com/databrickslabs/dolly)  - Databricks’ Dolly, a large language model trained on the Databricks Machine Learning Platform
62 | - [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b-instruct) - Falcon-40B-Instruct is a 40B parameters causal decoder-only model built by TII based on Falcon-40B and finetuned on a mixture of Baize. It is made available under the Apache 2.0 license.
63 | - [FastChat (Vicuna)](https://github.com/lm-sys/FastChat)  - An open platform for training, serving, and evaluating large language models. Release repo for Vicuna and FastChat-T5.
64 | - [GLM-6B (ChatGLM)](https://github.com/THUDM/ChatGLM-6B)  - An Open Bilingual Pre-Trained Model, quantization of ChatGLM-130B, can run on consumer-level GPUs.
65 | - [GLM-130B (ChatGLM)](https://github.com/THUDM/GLM-130B)  - An Open Bilingual Pre-Trained Model (ICLR 2023)
66 | - [GPT-NeoX](https://github.com/EleutherAI/gpt-neox)  - An implementation of model parallel autoregressive transformers on GPUs, based on the DeepSpeed library.
67 | - [Luotuo](https://github.com/LC1332/Luotuo-Chinese-LLM)  - A Chinese LLM, based on LLaMA and fine-tuned with Stanford Alpaca, Alpaca LoRA, Japanese-Alpaca-LoRA.
68 | - [StableLM](https://github.com/Stability-AI/StableLM)  - StableLM: Stability AI Language Models
69 |
70 | **[⬆ back to ToC](#table-of-contents)**
71 |
72 | ## CV Foundation Model
73 |
74 | - [disco-diffusion](https://github.com/alembics/disco-diffusion)  - A frankensteinian amalgamation of notebooks, models and techniques for the generation of AI Art and Animations.
75 | - [midjourney](https://www.midjourney.com/home/) - Midjourney is an independent research lab exploring new mediums of thought and expanding the imaginative powers of the human species.
76 | - [segment-anything (SAM)](https://github.com/facebookresearch/segment-anything)  - produces high quality object masks from input prompts such as points or boxes, and it can be used to generate masks for all objects in an image.
77 | - [stable-diffusion](https://github.com/CompVis/stable-diffusion)  - A latent text-to-image diffusion model
78 | - [stable-diffusion v2](https://github.com/Stability-AI/stablediffusion)  - High-Resolution Image Synthesis with Latent Diffusion Models
79 |
80 | **[⬆ back to ToC](#table-of-contents)**
81 |
82 | ## Audio Foundation Model
83 |
84 | - [bark](https://github.com/suno-ai/bark)  - Bark is a transformer-based text-to-audio model created by Suno. Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects.
85 | - [whisper](https://github.com/openai/whisper)  - Robust Speech Recognition via Large-Scale Weak Supervision
86 |
87 | # Serving
88 |
89 | ## Large Model Serving
90 |
91 | - [Alpaca-LoRA-Serve](https://github.com/deep-diver/Alpaca-LoRA-Serve)  - Alpaca-LoRA as Chatbot service
92 | - [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)  - MII makes low-latency and high-throughput inference possible, powered by DeepSpeed.
93 | - [FlexGen](https://github.com/FMInference/FlexGen)  - Running large language models on a single GPU for throughput-oriented scenarios.
94 | - [Flowise](https://github.com/FlowiseAI/Flowise)  - Drag & drop UI to build your customized LLM flow using LangchainJS.
95 | - [llama.cpp](https://github.com/ggerganov/llama.cpp)  - Port of Facebook's LLaMA model in C/C++
96 | - [Modelz-LLM](https://github.com/tensorchord/modelz-llm)  - OpenAI compatible API for LLMs and embeddings (LLaMA, Vicuna, ChatGLM and many others)
97 | - [whisper.cpp](https://github.com/ggerganov/whisper.cpp)  - Port of OpenAI's Whisper model in C/C++
98 | - [x-stable-diffusion](https://github.com/stochasticai/x-stable-diffusion)  - Real-time inference for Stable Diffusion - 0.88s latency. Covers AITemplate, nvFuser, TensorRT, FlashAttention.
99 |
100 | **[⬆ back to ToC](#table-of-contents)**
101 |
102 | ## Frameworks/Servers for Serving
103 |
104 | - [BentoML](https://github.com/bentoml/BentoML)  - The Unified Model Serving Framework
105 | - [Mosec](https://github.com/mosecorg/mosec)  - A machine learning model serving framework with dynamic batching and pipelined stages, provides an easy-to-use Python interface.
106 | - [TFServing](https://github.com/tensorflow/serving)  - A flexible, high-performance serving system for machine learning models.
107 | - [Torchserve](https://github.com/pytorch/serve)  - Serve, optimize and scale PyTorch models in production
108 | - [Triton Server (TRTIS)](https://github.com/triton-inference-server/server)  - The Triton Inference Server provides an optimized cloud and edge inferencing solution.
109 | - [langchain-serve](https://github.com/jina-ai/langchain-serve)  - Serverless LLM apps on Production with Jina AI Cloud
110 |
111 | **[⬆ back to ToC](#table-of-contents)**
112 |
113 | ## Observability
114 |
115 | - [Deepchecks](https://github.com/deepchecks/deepchecks)  - Tests for Continuous Validation of ML Models & Data. Deepchecks is a Python package for comprehensively validating your machine learning models and data with minimal effort.
116 | - [Evidently](https://github.com/evidentlyai/evidently)  - Evaluate and monitor ML models from validation to production.
117 | - [Great Expectations](https://github.com/great-expectations/great_expectations)  - Always know what to expect from your data.
118 | - [whylogs](https://github.com/whylabs/whylogs)  - The open standard for data logging
119 |
120 | **[⬆ back to ToC](#table-of-contents)**
121 |
122 | # LLMOps
123 |
124 | - [Arize-Phoenix](https://github.com/Arize-ai/phoenix)  - ML observability for LLMs, vision, language, and tabular models.
125 | - [deeplake](https://github.com/activeloopai/deeplake)  - Stream large multimodal datasets to achieve near 100% GPU utilization. Query, visualize, & version control data. Access data w/o the need to recompute the embeddings for the model finetuning.
126 | - [GPTCache](https://github.com/zilliztech/GPTCache)  - Creating semantic cache to store responses from LLM queries.
127 | - [Haystack](https://github.com/deepset-ai/haystack)  - Quickly compose applications with LLM Agents, semantic search, question-answering and more.
128 | - [langchain](https://github.com/hwchase17/langchain)  - Building applications with LLMs through composability
129 | - [LangFlow](https://github.com/logspace-ai/langflow)  - An effortless way to experiment and prototype LangChain flows with drag-and-drop components and a chat interface.
130 | - [LlamaIndex](https://github.com/jerryjliu/llama_index)  - Provides a central interface to connect your LLMs with external data.
131 | - [promptfoo](https://github.com/typpo/promptfoo)  - Open-source tool for testing & evaluating prompt quality. Create test cases, automatically check output quality and catch regressions, and reduce evaluation cost.
132 | - [Weights & Biases (Prompts)](https://docs.wandb.ai/guides/prompts) - A suite of LLMOps tools within the developer-first W&B MLOps platform. Utilize W&B Prompts for visualizing and inspecting LLM execution flow, tracking inputs and outputs, viewing intermediate results, securely managing prompts and LLM chain configurations.
133 | - [xTuring](https://github.com/stochasticai/xturing)  - Build and control your personal LLMs with fast and efficient fine-tuning.
134 | - [ZenML](https://github.com/zenml-io/zenml)  - Open-source framework for orchestrating, experimenting and deploying production-grade ML solutions, with built-in `langchain` & `llama_index` integrations.
135 | - [Dify](https://github.com/langgenius/dify)  - An open-source framework that aims to enable developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
136 |
137 | **[⬆ back to ToC](#table-of-contents)**
138 |
139 | # Search
140 |
141 | ## Vector search
142 |
143 | - [AquilaDB](https://github.com/Aquila-Network/AquilaDB)  - An easy to use Neural Search Engine. Index latent vectors along with JSON metadata and do efficient k-NN search.
144 | - [Chroma](https://github.com/chroma-core/chroma)  - the open source embedding database
145 | - [Jina](https://github.com/jina-ai/jina)  - Build multimodal AI services via cloud native technologies · Neural Search · Generative AI · Cloud Native
146 | - [Marqo](https://github.com/marqo-ai/marqo)  - Tensor search for humans.
147 | - [Milvus](https://github.com/milvus-io/milvus)  - Vector database for scalable similarity search and AI applications.
148 | - [Pinecone](https://www.pinecone.io/) - The Pinecone vector database makes it easy to build high-performance vector search applications. Developer-friendly, fully managed, and easily scalable without infrastructure hassles.
149 | - [pgvector](https://github.com/pgvector/pgvector)  - Open-source vector similarity search for Postgres.
150 | - [pgvecto.rs](https://github.com/tensorchord/pgvecto.rs)  - Vector database plugin for Postgres, written in Rust, specifically designed for LLM.
151 | - [Qdrant](https://github.com/qdrant/qdrant)  - Vector Search Engine and Database for the next generation of AI applications. Also available in the cloud
152 | - [txtai](https://github.com/neuml/txtai)  - Build AI-powered semantic search applications
153 | - [Vald](https://github.com/vdaas/vald)  - A Highly Scalable Distributed Vector Search Engine
154 | - [Vearch](https://github.com/vearch/vearch)  - A distributed system for embedding-based vector retrieval
155 | - [Weaviate](https://github.com/semi-technologies/weaviate)  - Weaviate is an open source vector search engine that stores both objects and vectors, allowing for combining vector search with structured filtering with the fault-tolerance and scalability of a cloud-native database, all accessible through GraphQL, REST, and various language clients.
156 |
157 | **[⬆ back to ToC](#table-of-contents)**
158 |
159 | # Code AI
160 |
161 | - [CodeGen](https://github.com/salesforce/CodeGen)  - CodeGen is an open-source model for program synthesis. Trained on TPU-v4. Competitive with OpenAI Codex.
162 | - [CodeT5](https://github.com/salesforce/CodeT5)  - Open Code LLMs for Code Understanding and Generation.
163 | - [fauxpilot](https://github.com/fauxpilot/fauxpilot)  - An open-source alternative to GitHub Copilot server
164 | - [tabby](https://github.com/TabbyML/tabby)  - Self-hosted AI coding assistant. An open-source / on-prem alternative to GitHub Copilot.
165 |
166 | # Training
167 |
168 | ## IDEs and Workspaces
169 |
170 | - [code server](https://github.com/coder/code-server)  - Run VS Code on any machine anywhere and access it in the browser.
171 | - [conda](https://github.com/conda/conda)  - OS-agnostic, system-level binary package manager and ecosystem.
172 | - [Docker](https://github.com/moby/moby)  - Moby is an open-source project created by Docker to enable and accelerate software containerization.
173 | - [envd](https://github.com/tensorchord/envd)  - 🏕️ Reproducible development environment for AI/ML.
174 | - [Jupyter Notebooks](https://github.com/jupyter/notebook)  - The Jupyter notebook is a web-based notebook environment for interactive computing.
175 | - [Kurtosis](https://github.com/kurtosis-tech/kurtosis)  - A build, packaging, and run system for ephemeral multi-container environments.
176 |
177 | **[⬆ back to ToC](#table-of-contents)**
178 |
179 | ## Foundation Model Fine Tuning
180 |
181 | - [alpaca-lora](https://github.com/tloen/alpaca-lora)  - Instruct-tune LLaMA on consumer hardware
182 | - [LMFlow](https://github.com/OptimalScale/LMFlow)  - An Extensible Toolkit for Finetuning and Inference of Large Foundation Models
183 | - [Lora](https://github.com/cloneofsimo/lora)  - Using Low-rank adaptation to quickly fine-tune diffusion models.
184 | - [peft](https://github.com/huggingface/peft)  - State-of-the-art Parameter-Efficient Fine-Tuning.
185 | - [p-tuning-v2](https://github.com/THUDM/P-tuning-v2)  - An optimized prompt tuning strategy achieving comparable performance to fine-tuning on small/medium-sized models and sequence tagging challenges. [(ACL 2022)](https://arxiv.org/abs/2110.07602)
186 | - [QLoRA](https://github.com/artidoro/qlora)  - Efficient finetuning approach that reduces memory usage enough to finetune a 65B parameter model on a single 48GB GPU while preserving full 16-bit finetuning task performance.
187 |
188 | **[⬆ back to ToC](#table-of-contents)**
189 |
190 | ## Frameworks for Training
191 |
192 | - [Accelerate](https://github.com/huggingface/accelerate)  - 🚀 A simple way to train and use PyTorch models with multi-GPU, TPU, mixed-precision.
193 | - [Apache MXNet](https://github.com/apache/mxnet)  - Lightweight, Portable, Flexible Distributed/Mobile Deep Learning with Dynamic, Mutation-aware Dataflow Dep Scheduler.
194 | - [Caffe](https://github.com/BVLC/caffe)  - A fast open framework for deep learning.
195 | - [ColossalAI](https://github.com/hpcaitech/ColossalAI)  - An integrated large-scale model training system with efficient parallelization techniques.
196 | - [DeepSpeed](https://github.com/microsoft/DeepSpeed)  - DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.
197 | - [Horovod](https://github.com/horovod/horovod)  - Distributed training framework for TensorFlow, Keras, PyTorch, and Apache MXNet.
198 | - [Jax](https://github.com/google/jax)  - Autograd and XLA for high-performance machine learning research.
199 | - [Kedro](https://github.com/kedro-org/kedro)  - Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code.
200 | - [Keras](https://github.com/keras-team/keras)  - Keras is a deep learning API written in Python, running on top of the machine learning platform TensorFlow.
201 | - [LightGBM](https://github.com/microsoft/LightGBM)  - A fast, distributed, high performance gradient boosting (GBT, GBDT, GBRT, GBM or MART) framework based on decision tree algorithms, used for ranking, classification and many other machine learning tasks.
202 | - [MegEngine](https://github.com/MegEngine/MegEngine)  - MegEngine is a fast, scalable and easy-to-use deep learning framework, with auto-differentiation.
203 | - [metric-learn](https://github.com/scikit-learn-contrib/metric-learn)  - Metric Learning Algorithms in Python.
204 | - [MindSpore](https://github.com/mindspore-ai/mindspore)  - MindSpore is a new open source deep learning training/inference framework that could be used for mobile, edge and cloud scenarios.
205 | - [Oneflow](https://github.com/Oneflow-Inc/oneflow)  - OneFlow is a performance-centered and open-source deep learning framework.
206 | - [PaddlePaddle](https://github.com/PaddlePaddle/Paddle)  - Machine Learning Framework from Industrial Practice.
207 | - [PyTorch](https://github.com/pytorch/pytorch)  - Tensors and Dynamic neural networks in Python with strong GPU acceleration.
208 | - [PyTorchLightning](https://github.com/PyTorchLightning/pytorch-lightning)  - The lightweight PyTorch wrapper for high-performance AI research. Scale your models, not the boilerplate.
209 | - [XGBoost](https://github.com/dmlc/xgboost)  - Scalable, Portable and Distributed Gradient Boosting (GBDT, GBRT or GBM) Library.
210 | - [scikit-learn](https://github.com/scikit-learn/scikit-learn)  - Machine Learning in Python.
211 | - [TensorFlow](https://github.com/tensorflow/tensorflow)  - An Open Source Machine Learning Framework for Everyone.
212 | - [VectorFlow](https://github.com/Netflix/vectorflow)  - A minimalist neural network library optimized for sparse data and single machine environments.
213 |
214 | **[⬆ back to ToC](#table-of-contents)**
215 |
216 | ## Experiment Tracking
217 |
218 | - [Aim](https://github.com/aimhubio/aim)  - an easy-to-use and performant open-source experiment tracker.
219 | - [ClearML](https://github.com/allegroai/clearml)  - Auto-Magical CI/CD to streamline your ML workflow. Experiment Manager, MLOps and Data-Management
220 | - [Guild AI](https://github.com/guildai/guildai)  - Experiment tracking, ML developer tools.
221 | - [MLRun](https://github.com/mlrun/mlrun)  - Machine Learning automation and tracking.
222 | - [Kedro-Viz](https://github.com/kedro-org/kedro-viz)  - Kedro-Viz is an interactive development tool for building data science pipelines with Kedro. Kedro-Viz also allows users to view and compare different runs in the Kedro project.
223 | - [LabNotebook](https://github.com/henripal/labnotebook)  - LabNotebook is a tool that allows you to flexibly monitor, record, save, and query all your machine learning experiments.
224 | - [Sacred](https://github.com/IDSIA/sacred)  - Sacred is a tool to help you configure, organize, log and reproduce experiments.
225 | - [Weights & Biases](https://github.com/wandb/wandb)  - A developer first, lightweight, user-friendly experiment tracking and visualization tool for machine learning projects, streamlining collaboration and simplifying MLOps. W&B excels at tracking LLM-powered applications, featuring W&B Prompts for LLM execution flow visualization, input and output monitoring, and secure management of prompts and LLM chain configurations.
226 |
227 | **[⬆ back to ToC](#table-of-contents)**
228 |
229 | ## Visualization
230 |
231 | - [Manifold](https://github.com/uber/manifold)  - A model-agnostic visual debugging tool for machine learning.
232 | - [netron](https://github.com/lutzroeder/netron)  - Visualizer for neural network, deep learning, and machine learning models.
233 | - [OpenOps](https://github.com/ThePlugJumbo/openops)  - Bring multiple data streams into one dashboard.
234 | - [TensorBoard](https://github.com/tensorflow/tensorboard)  - TensorFlow's Visualization Toolkit.
235 | - [TensorSpace](https://github.com/tensorspace-team/tensorspace)  - Neural network 3D visualization framework for building interactive and intuitive models in browsers, with support for pre-trained deep learning models from TensorFlow, Keras, TensorFlow.js.
236 | - [dtreeviz](https://github.com/parrt/dtreeviz)  - A python library for decision tree visualization and model interpretation.
237 | - [Zetane Viewer](https://github.com/zetane/viewer)  - ML models and internal tensors 3D visualizer.
238 | - [Zeno](https://github.com/zeno-ml/zeno)  - AI evaluation platform for interactively exploring data and model outputs.
239 |
240 | **[⬆ back to ToC](#table-of-contents)**
241 |
242 | # Data
243 |
244 | ## Data Management
245 |
246 | - [ArtiVC](https://github.com/InfuseAI/ArtiVC)  - A version control system to manage large files.
247 | - [Deep Lake](https://github.com/activeloopai/deeplake)  - Deep Lake is a dataset format with a simple API for creating, storing, and collaborating on AI datasets of any size.
248 | - [Dolt](https://github.com/dolthub/dolt)  - Git for Data.
249 | - [DVC](https://github.com/iterative/dvc)  - Data Version Control | Git for Data & Models | ML Experiments Management.
250 | - [Delta-Lake](https://github.com/delta-io/delta)  - Storage layer that brings scalable, ACID transactions to Apache Spark and other engines.
251 | - [Pachyderm](https://github.com/pachyderm/pachyderm)  - Pachyderm is a version control system for data.
252 | - [Quilt](https://github.com/quiltdata/quilt)  - A self-organizing data hub for S3.
253 |
254 | **[⬆ back to ToC](#table-of-contents)**
255 |
256 | ## Data Storage
257 |
258 | - [JuiceFS](https://github.com/juicedata/juicefs)  - A distributed POSIX file system built on top of Redis and S3.
259 | - [LakeFS](https://github.com/treeverse/lakeFS)  - Git-like capabilities for your object storage.
260 | - [Lance](https://github.com/eto-ai/lance)  - Modern columnar data format for ML implemented in Rust.
261 |
262 | **[⬆ back to ToC](#table-of-contents)**
263 |
264 | ## Data Tracking
265 |
266 | - [Piperider](https://github.com/InfuseAI/piperider)  - A CLI tool that allows you to build data profiles and write assertion tests for easily evaluating and tracking your data's reliability over time.
267 | - [LUX](https://github.com/lux-org/lux)  - A Python library that facilitates fast and easy data exploration by automating the visualization and data analysis process.
268 |
269 | **[⬆ back to ToC](#table-of-contents)**
270 |
271 | ## Feature Engineering
272 |
273 | - [Featureform](https://github.com/featureform/featureform)  - The Virtual Feature Store. Turn your existing data infrastructure into a feature store.
274 | - [FeatureTools](https://github.com/Featuretools/featuretools)  - An open source python framework for automated feature engineering
275 |
276 | **[⬆ back to ToC](#table-of-contents)**
277 |
278 | ## Data/Feature enrichment
279 |
280 | - [Upgini](https://github.com/upgini/upgini)  - Free automated data & feature enrichment library for machine learning: automatically searches through thousands of ready-to-use features from public and community shared data sources and enriches your training dataset with only the accuracy improving features
281 | - [Feast](https://github.com/feast-dev/feast)  - An open source feature store for machine learning.
282 |
283 | **[⬆ back to ToC](#table-of-contents)**
284 |
285 | # Large Scale Deployment
286 |
287 | ## ML Platforms
288 |
289 | - [ClearML](https://github.com/allegroai/clearml)  - Auto-Magical CI/CD to streamline your ML workflow. Experiment Manager, MLOps and Data-Management.
290 | - [MLflow](https://github.com/mlflow/mlflow)  - Open source platform for the machine learning lifecycle.
291 | - [MLRun](https://github.com/mlrun/mlrun)  - An open MLOps platform for quickly building and managing continuous ML applications across their lifecycle.
292 | - [ModelFox](https://github.com/modelfoxdotdev/modelfox)  - ModelFox is a platform for managing and deploying machine learning models.
293 | - [Kserve](https://github.com/kserve/kserve)  - Standardized Serverless ML Inference Platform on Kubernetes
294 | - [Kubeflow](https://github.com/kubeflow/kubeflow)  - Machine Learning Toolkit for Kubernetes.
295 | - [PAI](https://github.com/microsoft/pai)  - Resource scheduling and cluster management for AI.
296 | - [Polyaxon](https://github.com/polyaxon/polyaxon)  - Machine Learning Management & Orchestration Platform.
297 | - [Primehub](https://github.com/InfuseAI/primehub)  - An effortless infrastructure for machine learning built on the top of Kubernetes.
298 | - [Seldon-core](https://github.com/SeldonIO/seldon-core)  - An MLOps framework to package, deploy, monitor and manage thousands of production machine learning models
299 | - [Weights & Biases](https://github.com/wandb/wandb)  - A lightweight and flexible platform for machine learning experiment tracking, dataset versioning, and model management, enhancing collaboration and streamlining MLOps workflows. W&B excels at tracking LLM-powered applications, featuring W&B Prompts for LLM execution flow visualization, input and output monitoring, and secure management of prompts and LLM chain configurations.
300 |
301 | **[⬆ back to ToC](#table-of-contents)**
302 |
303 | ## Workflow
304 |
305 | - [Airflow](https://airflow.apache.org/)  - A platform to programmatically author, schedule and monitor workflows.
306 | - [aqueduct](https://github.com/aqueducthq/aqueduct)  - An Open-Source Platform for Production Data Science
307 | - [Argo Workflows](https://github.com/argoproj/argo-workflows)  - Workflow engine for Kubernetes.
308 | - [Flyte](https://github.com/flyteorg/flyte)  - Kubernetes-native workflow automation platform for complex, mission-critical data and ML processes at scale.
309 | - [Kubeflow Pipelines](https://github.com/kubeflow/pipelines)  - Machine Learning Pipelines for Kubeflow.
310 | - [LangFlow](https://github.com/logspace-ai/langflow)  - An effortless way to experiment and prototype LangChain flows with drag-and-drop components and a chat interface.
311 | - [Metaflow](https://github.com/Netflix/metaflow)  - Build and manage real-life data science projects with ease!
312 | - [Ploomber](https://github.com/ploomber/ploomber)  - The fastest way to build data pipelines. Develop iteratively, deploy anywhere.
313 | - [Prefect](https://github.com/PrefectHQ/prefect)  - The easiest way to automate your data.
314 | - [VDP](https://github.com/instill-ai/vdp)  - An open-source unstructured data ETL tool to streamline the end-to-end unstructured data processing pipeline.
315 | - [ZenML](https://github.com/zenml-io/zenml)  - MLOps framework to create reproducible pipelines.
316 |
317 | **[⬆ back to ToC](#table-of-contents)**
318 |
319 | ## Scheduling
320 |
321 | - [Kueue](https://github.com/kubernetes-sigs/kueue)  - Kubernetes-native Job Queueing.
322 | - [PAI](https://github.com/microsoft/pai)  - Resource scheduling and cluster management for AI (Open-sourced by Microsoft).
323 | - [Slurm](https://github.com/SchedMD/slurm)  - A Highly Scalable Workload Manager.
324 | - [Volcano](https://github.com/volcano-sh/volcano)  - A Cloud Native Batch System (Project under CNCF).
325 | - [Yunikorn](https://github.com/apache/yunikorn-core)  - Light-weight, universal resource scheduler for container orchestrator systems.
326 |
327 | **[⬆ back to ToC](#table-of-contents)**
328 |
329 | ## Model Management
330 |
331 | - [dvc](https://github.com/iterative/dvc)  - Data Version Control | Git for Data & Models | ML Experiments Management
332 | - [ModelDB](https://github.com/VertaAI/modeldb)  - Open Source ML Model Versioning, Metadata, and Experiment Management
333 | - [MLEM](https://github.com/iterative/mlem)  - A tool to package, serve, and deploy any ML model on any platform.
334 | - [ormb](https://github.com/kleveross/ormb)  - Docker for Your ML/DL Models Based on OCI Artifacts
335 |
336 | **[⬆ back to ToC](#table-of-contents)**
337 |
338 | # Performance
339 |
340 | ## ML Compiler
341 |
342 | - [ONNX-MLIR](https://github.com/onnx/onnx-mlir)  - Compiler technology to transform a valid Open Neural Network Exchange (ONNX) graph into code that implements the graph with minimum runtime support.
343 | - [TVM](https://github.com/apache/tvm)  - Open deep learning compiler stack for cpu, gpu and specialized accelerators
344 |
345 | **[⬆ back to ToC](#table-of-contents)**
346 |
347 | ## Profiling
348 |
349 | - [octoml-profile](https://github.com/octoml/octoml-profile)  - octoml-profile is a python library and cloud service designed to provide the simplest experience for assessing and optimizing the performance of PyTorch models on cloud hardware with state-of-the-art ML acceleration technology.
350 | - [scalene](https://github.com/plasma-umass/scalene)  - a high-performance, high-precision CPU, GPU, and memory profiler for Python
351 |
352 | **[⬆ back to ToC](#table-of-contents)**
353 |
354 | # AutoML
355 |
356 | - [Archai](https://github.com/microsoft/archai)  - a platform for Neural Architecture Search (NAS) that allows you to generate efficient deep networks for your applications.
357 | - [autoai](https://github.com/blobcity/autoai)  - A framework to find the best performing AI/ML model for any AI problem.
358 | - [AutoGL](https://github.com/THUMNLab/AutoGL)  - An autoML framework & toolkit for machine learning on graphs
359 | - [AutoGluon](https://github.com/awslabs/autogluon)  - AutoML for Image, Text, and Tabular Data.
360 | - [automl-gs](https://github.com/minimaxir/automl-gs)  - Provide an input CSV and a target field to predict, generate a model + code to run it.
361 | - [autokeras](https://github.com/keras-team/autokeras)  - AutoML library for deep learning.
362 | - [Auto-PyTorch](https://github.com/automl/Auto-PyTorch)  - Automatic architecture search and hyperparameter optimization for PyTorch.
363 | - [auto-sklearn](https://github.com/automl/auto-sklearn)  - an automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator.
364 | - [Dragonfly](https://github.com/dragonfly/dragonfly)  - An open source python library for scalable Bayesian optimisation.
365 | - [Determined](https://github.com/determined-ai/determined)  - scalable deep learning training platform with integrated hyperparameter tuning support; includes Hyperband, PBT, and other search methods.
366 | - [DEvol (DeepEvolution)](https://github.com/joeddav/devol)  - a basic proof of concept for genetic architecture search in Keras.
367 | - [EvalML](https://github.com/alteryx/evalml)  - An open source python library for AutoML.
368 | - [FEDOT](https://github.com/nccr-itmo/FEDOT)  - AutoML framework for the design of composite pipelines.
369 | - [FLAML](https://github.com/microsoft/FLAML)  - Fast and lightweight AutoML ([paper](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/)).
370 | - [Goptuna](https://github.com/c-bata/goptuna)  - A hyperparameter optimization framework, inspired by Optuna.
371 | - [HpBandSter](https://github.com/automl/HpBandSter)  - a framework for distributed hyperparameter optimization.
372 | - [HPOlib2](https://github.com/automl/HPOlib2)  - a library for hyperparameter optimization and black box optimization benchmarks.
373 | - [Hyperband](https://github.com/zygmuntz/hyperband)  - open source code for tuning hyperparams with Hyperband.
374 | - [Hypernets](https://github.com/DataCanvasIO/Hypernets)  - A General Automated Machine Learning Framework.
375 | - [Hyperopt](https://github.com/hyperopt/hyperopt)  - Distributed Asynchronous Hyperparameter Optimization in Python.
376 | - [hypertunity](https://github.com/gdikov/hypertunity)  - A toolset for black-box hyperparameter optimisation.
377 | - [Katib](https://github.com/kubeflow/katib)  - Katib is a Kubernetes-native project for automated machine learning (AutoML).
378 | - [Keras Tuner](https://github.com/keras-team/keras-tuner)  - Hyperparameter tuning for humans.
379 | - [learn2learn](https://github.com/learnables/learn2learn)  - PyTorch Meta-learning Framework for Researchers.
380 | - [Ludwig](https://github.com/uber/ludwig)  - a toolbox built on top of TensorFlow that allows you to train and test deep learning models without the need to write code.
381 | - [MOE](https://github.com/Yelp/MOE)  - a global, black box optimization engine for real world metric optimization by Yelp.
382 | - [Model Search](https://github.com/google/model_search)  - a framework that implements AutoML algorithms for model architecture search at scale.
383 | - [NASGym](https://github.com/gomerudo/nas-env)  - a proof-of-concept OpenAI Gym environment for Neural Architecture Search (NAS).
384 | - [NNI](https://github.com/Microsoft/nni)  - An open source AutoML toolkit to automate the machine learning lifecycle, including feature engineering, neural architecture search, model compression and hyper-parameter tuning.
385 | - [Optuna](https://github.com/optuna/optuna)  - A hyperparameter optimization framework.
386 | - [Pycaret](https://github.com/pycaret/pycaret)  - An open-source, low-code machine learning library in Python that automates machine learning workflows.
387 | - [Ray Tune](https://github.com/ray-project/ray)  - Scalable Hyperparameter Tuning.
388 | - [REMBO](https://github.com/ziyuw/rembo)  - Bayesian optimization in high-dimensions via random embedding.
389 | - [RoBO](https://github.com/automl/RoBO)  - a Robust Bayesian Optimization framework.
390 | - [scikit-optimize(skopt)](https://github.com/scikit-optimize/scikit-optimize)  - Sequential model-based optimization with a `scipy.optimize` interface.
391 | - [Spearmint](https://github.com/HIPS/Spearmint)  - a software package to perform Bayesian optimization.
392 | - [TPOT](http://automl.info/tpot/)  - one of the very first AutoML methods and open-source software packages.
393 | - [Torchmeta](https://github.com/tristandeleu/pytorch-meta)  - A Meta-Learning library for PyTorch.
394 | - [Vega](https://github.com/huawei-noah/vega)  - an AutoML algorithm tool chain by Huawei Noah's Ark Lab.
395 |
396 | **[⬆ back to ToC](#table-of-contents)**
397 |
398 | # Optimizations
399 |
400 | - [FeatherCNN](https://github.com/Tencent/FeatherCNN)  - FeatherCNN is a high performance inference engine for convolutional neural networks.
401 | - [Forward](https://github.com/Tencent/Forward)  - A library for high performance deep learning inference on NVIDIA GPUs.
402 | - [NCNN](https://github.com/Tencent/ncnn)  - ncnn is a high-performance neural network inference framework optimized for the mobile platform.
403 | - [PocketFlow](https://github.com/Tencent/PocketFlow)  - use AutoML to do model compression.
404 | - [TensorFlow Model Optimization](https://github.com/tensorflow/model-optimization)  - A suite of tools that users, both novice and advanced, can use to optimize machine learning models for deployment and execution.
405 | - [TNN](https://github.com/Tencent/TNN)  - A uniform deep learning inference framework for mobile, desktop and server.
406 |
407 | **[⬆ back to ToC](#table-of-contents)**
408 |
409 | # Federated ML
410 |
411 | - [EasyFL](https://github.com/EasyFL-AI/EasyFL)  - An Easy-to-use Federated Learning Platform
412 | - [FATE](https://github.com/FederatedAI/FATE)  - An Industrial Grade Federated Learning Framework
413 | - [FedML](https://github.com/FedML-AI/FedML)  - The federated learning and analytics library enabling secure and collaborative machine learning on decentralized data anywhere at any scale. Supporting large-scale cross-silo federated learning, cross-device federated learning on smartphones/IoTs, and research simulation.
414 | - [Flower](https://github.com/adap/flower)  - A Friendly Federated Learning Framework
415 | - [Harmonia](https://github.com/ailabstw/harmonia)  - Harmonia is an open-source project aiming at developing systems/infrastructures and libraries to ease the adoption of federated learning (abbreviated to FL) for research and production usage.
416 | - [TensorFlow Federated](https://github.com/tensorflow/federated)  - A framework for implementing federated learning
417 |
418 | **[⬆ back to ToC](#table-of-contents)**
419 |
420 | # Awesome Lists
421 |
422 | - [Awesome Argo](https://github.com/terrytangyuan/awesome-argo)  - A curated list of awesome projects and resources related to Argo
423 | - [Awesome AutoDL](https://github.com/D-X-Y/Awesome-AutoDL)  - Automated Deep Learning: Neural Architecture Search Is Not the End (a curated list of AutoDL resources and an in-depth analysis)
424 | - [Awesome AutoML](https://github.com/windmaple/awesome-AutoML)  - Curating a list of AutoML-related research, tools, projects and other resources
425 | - [Awesome AutoML Papers](https://github.com/hibayesian/awesome-automl-papers)  - A curated list of automated machine learning papers, articles, tutorials, slides and projects
426 | - [Awesome Federated Learning Systems](https://github.com/AmberLJC/FLsystem-paper/blob/main/README.md) - A curated list of Federated Learning Systems related academic papers, articles, tutorials, slides and projects.
427 | - [Awesome Federated Learning](https://github.com/chaoyanghe/Awesome-Federated-Learning)  - A curated list of federated learning publications, re-organized from Arxiv (mostly)
428 | - [awesome-federated-learning](https://github.com/weimingwill/awesome-federated-learning)  - All materials you need for Federated Learning: blogs, videos, papers, software, etc.
429 | - [Awesome Open MLOps](https://github.com/fuzzylabs/awesome-open-mlops)  - This is the Fuzzy Labs guide to the universe of free and open source MLOps tools.
430 | - [Awesome Production Machine Learning](https://github.com/EthicalML/awesome-production-machine-learning)  - A curated list of awesome open source libraries to deploy, monitor, version and scale your machine learning
431 | - [Awesome Tensor Compilers](https://github.com/merrymercy/awesome-tensor-compilers)  - A list of awesome compiler projects and papers for tensor computation and deep learning.
432 | - [kelvins/awesome-mlops](https://github.com/kelvins/awesome-mlops)  - A curated list of awesome MLOps tools.
433 | - [visenger/awesome-mlops](https://github.com/visenger/awesome-mlops)  - An awesome list of references for MLOps - Machine Learning Operations
434 | - [currentslab/awesome-vector-search](https://github.com/currentslab/awesome-vector-search)  - A curated list of awesome vector search frameworks/engines, libraries, cloud services and research papers on vector similarity search.
435 |
436 | **[⬆ back to ToC](#table-of-contents)**
437 |
--------------------------------------------------------------------------------
/contributing.md:
--------------------------------------------------------------------------------
1 | # Contribution Guidelines
2 |
3 | Please ensure your pull request adheres to the following guidelines:
4 |
5 | - New categories or improvements to the existing categorization are welcome.
6 | - Search previous suggestions before making a new one, as yours may be a duplicate.
7 | - Make an individual pull request for each suggestion.
8 | - Run `./scripts/generate-star-badges.py` to generate GitHub star badges if needed.
9 | - Run `./scripts/github-markdown-toc ./README.md` to generate ToC if needed.
10 | - Order link titles alphabetically within each category.
11 |
12 | Thank you for your suggestions!
13 |
--------------------------------------------------------------------------------
/scripts/generate-star-badges.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | import shutil
5 |
# Markdown file that the script reads and then overwrites.
filename = "README.md"
# Where the untouched copy of that file is saved before it is overwritten.
filename_backup = filename + ".backup"
8 |
9 |
def is_link_line(line) -> bool:
    """Tell whether `line` is a markdown list entry that opens with a link.

    Such lines begin with the three characters "- [" at column 0.
    """
    return line.startswith("- [")
15 |
def is_github_project(line) -> bool:
    """Tell whether `line` mentions an https GitHub URL anywhere in its text."""
    return "https://github.com" in line
20 |
def contains_star_badge(line) -> bool:
    """Tell whether `line` already carries a shields.io GitHub-stars badge URL."""
    return "https://img.shields.io/github/stars" in line
25 |
26 |
def generate_badge_link(line) -> str:
    """Return `line` with a GitHub star badge inserted after its first GitHub link.

    The first markdown link target of the form
    `](https://github.com/<owner>/<repo>...)` is located, and a shields.io
    star badge for `<owner>/<repo>` is inserted right after its closing
    parenthesis. Anything after the repository name in the URL (for example
    `/blob/main/README.md`) is ignored when building the badge.

    Args:
        line: One line of the markdown file, with or without a trailing newline.

    Returns:
        The line with the badge inserted, or the line unchanged when it holds
        no markdown link to a GitHub repository.
    """
    import re  # local import keeps this function self-contained

    link = re.search(
        r"\]\(https://github\.com/(?P<owner>[\w.-]+)/(?P<repo>[\w.-]+)[^)\s]*\)",
        line,
    )
    if link is None:
        # Nothing to attach a badge to; leave the line exactly as it was.
        return line

    project = link.group("owner") + "/" + link.group("repo")
    print("The project handle of this line is " + project)

    head, tail = line[:link.end()], line[link.end():]
    # The URL prefix must stay in sync with contains_star_badge(), otherwise a
    # second run of the script would add the badge again.
    badge_link = (
        " ![GitHub stars](https://img.shields.io/github/stars/"
        + project
        + "?style=social)"
    )
    # Separate the badge from following text, but do not leave trailing
    # whitespace when the link is the last thing on the line.
    if tail and not tail[0].isspace():
        badge_link += " "
    newline = head + badge_link + tail
    print("The new line is " + newline)
    return newline
39 |
40 |
def generate_star_badge(line) -> str:
    """Return `line` with a GitHub star badge added when it lacks one.

    Only list entries that start with a link and mention a GitHub URL are
    candidates; every other line, and every line that already has a badge,
    is returned unchanged.
    """
    needs_badge = (
        is_link_line(line)
        and is_github_project(line)
        and not contains_star_badge(line)
    )
    if not needs_badge:
        # Return other lines unchanged.
        return line
    print("This line does not contain the star badge: " + line)
    return generate_badge_link(line)
50 |
51 |
def main() -> int:
    """Add missing GitHub star badges to the README, in place.

    Every line of `filename` is passed through generate_star_badge(). The
    original file is then copied to `filename_backup`, and `filename` is
    overwritten with the processed lines.

    Returns:
        0, used by the caller as the process exit status.
    """
    # Read and write explicitly as UTF-8: the README contains non-ASCII
    # characters, and the platform default encoding may not be UTF-8.
    with open(filename, "r", encoding="utf-8") as f:
        lines = [generate_star_badge(line) for line in f]
    # Keep a copy of the untouched file before overwriting it.
    shutil.copyfile(filename, filename_backup)
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(lines)
    return 0
63 |
64 |
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
67 |
--------------------------------------------------------------------------------
/scripts/github-markdown-toc:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Steps:
5 | #
6 | # 1. Download corresponding html file for some README.md:
7 | # curl -s $1
8 | #
9 | # 2. Discard rows where no substring 'user-content-' (github's markup):
10 | # awk '/user-content-/ { ...
11 | #
12 | # 3.1 Get last number in each row like ' ... sitemap.js.*<\/h/)+2, RLENGTH-5)
21 | #
22 | # 5. Find anchor and insert it inside "(...)":
23 | # substr($0, match($0, "href=\"[^\"]+?\" ")+6, RLENGTH-8)
24 | #
25 |
26 | gh_toc_version="0.8.0"
27 |
28 | gh_user_agent="gh-md-toc v$gh_toc_version"
29 |
#
# Download the HTML rendering of a README.md by its URL and write it to stdout.
#
# $1 - URL of the document to fetch.
#
# Uses curl when it is installed, otherwise wget. When neither is available,
# prints a hint to stderr and exits with status 1.
#
gh_toc_load() {
    local gh_url=$1

    if type curl &>/dev/null; then
        curl --user-agent "$gh_user_agent" -s "$gh_url"
    elif type wget &>/dev/null; then
        wget --user-agent="$gh_user_agent" -qO- "$gh_url"
    else
        # stdout carries the downloaded HTML and is piped into the TOC
        # grabber, so the hint must go to stderr to reach the user.
        echo "Please, install 'curl' or 'wget' and try again." >&2
        exit 1
    fi
}
46 |
47 | #
48 | # Converts local md file into html by GitHub
49 | #
50 | # -> curl -X POST --data '{"text": "Hello world github/linguist#1 **cool**, and #1!"}' https://api.github.com/markdown
51 | #
Hello world github/linguist#1 cool, and #1!
'" 52 | gh_toc_md2html() { 53 | local gh_file_md=$1 54 | local skip_header=$2 55 | 56 | URL=https://api.github.com/markdown/raw 57 | 58 | if [ ! -z "$GH_TOC_TOKEN" ]; then 59 | TOKEN=$GH_TOC_TOKEN 60 | else 61 | TOKEN_FILE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt" 62 | if [ -f "$TOKEN_FILE" ]; then 63 | TOKEN="$(cat $TOKEN_FILE)" 64 | fi 65 | fi 66 | if [ ! -z "${TOKEN}" ]; then 67 | AUTHORIZATION="Authorization: token ${TOKEN}" 68 | fi 69 | 70 | local gh_tmp_file_md=$gh_file_md 71 | if [ "$skip_header" = "yes" ]; then 72 | if grep -Fxq "" $gh_src; then 73 | # cut everything before the toc 74 | gh_tmp_file_md=$gh_file_md~~ 75 | sed '1,//d' $gh_file_md > $gh_tmp_file_md 76 | fi 77 | fi 78 | 79 | # echo $URL 1>&2 80 | OUTPUT=$(curl -s \ 81 | --user-agent "$gh_user_agent" \ 82 | --data-binary @"$gh_tmp_file_md" \ 83 | -H "Content-Type:text/plain" \ 84 | -H "$AUTHORIZATION" \ 85 | "$URL") 86 | 87 | rm -f $gh_file_md~~ 88 | 89 | if [ "$?" != "0" ]; then 90 | echo "XXNetworkErrorXX" 91 | fi 92 | if [ "$(echo "${OUTPUT}" | awk '/API rate limit exceeded/')" != "" ]; then 93 | echo "XXRateLimitXX" 94 | else 95 | echo "${OUTPUT}" 96 | fi 97 | } 98 | 99 | 100 | # 101 | # Is passed string url 102 | # 103 | gh_is_url() { 104 | case $1 in 105 | https* | http*) 106 | echo "yes";; 107 | *) 108 | echo "no";; 109 | esac 110 | } 111 | 112 | # 113 | # TOC generator 114 | # 115 | gh_toc(){ 116 | local gh_src=$1 117 | local gh_src_copy=$1 118 | local gh_ttl_docs=$2 119 | local need_replace=$3 120 | local no_backup=$4 121 | local no_footer=$5 122 | local indent=$6 123 | local skip_header=$7 124 | 125 | if [ "$gh_src" = "" ]; then 126 | echo "Please, enter URL or local path for a README.md" 127 | exit 1 128 | fi 129 | 130 | 131 | # Show "TOC" string only if working with one document 132 | if [ "$gh_ttl_docs" = "1" ]; then 133 | 134 | echo "Table of Contents" 135 | echo "=================" 136 | echo "" 137 | gh_src_copy="" 138 | 139 | fi 140 | 141 | if [ "$(gh_is_url 
"$gh_src")" == "yes" ]; then 142 | gh_toc_load "$gh_src" | gh_toc_grab "$gh_src_copy" "$indent" 143 | if [ "${PIPESTATUS[0]}" != "0" ]; then 144 | echo "Could not load remote document." 145 | echo "Please check your url or network connectivity" 146 | exit 1 147 | fi 148 | if [ "$need_replace" = "yes" ]; then 149 | echo 150 | echo "!! '$gh_src' is not a local file" 151 | echo "!! Can't insert the TOC into it." 152 | echo 153 | fi 154 | else 155 | local rawhtml=$(gh_toc_md2html "$gh_src" "$skip_header") 156 | if [ "$rawhtml" == "XXNetworkErrorXX" ]; then 157 | echo "Parsing local markdown file requires access to github API" 158 | echo "Please make sure curl is installed and check your network connectivity" 159 | exit 1 160 | fi 161 | if [ "$rawhtml" == "XXRateLimitXX" ]; then 162 | echo "Parsing local markdown file requires access to github API" 163 | echo "Error: You exceeded the hourly limit. See: https://developer.github.com/v3/#rate-limiting" 164 | TOKEN_FILE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt" 165 | echo "or place GitHub auth token here: ${TOKEN_FILE}" 166 | exit 1 167 | fi 168 | local toc=`echo "$rawhtml" | gh_toc_grab "$gh_src_copy" "$indent"` 169 | echo "$toc" 170 | if [ "$need_replace" = "yes" ]; then 171 | if grep -Fxq "" $gh_src && grep -Fxq "" $gh_src; then 172 | echo "Found markers" 173 | else 174 | echo "You don't have or in your file...exiting" 175 | exit 1 176 | fi 177 | local ts="<\!--ts-->" 178 | local te="<\!--te-->" 179 | local dt=`date +'%F_%H%M%S'` 180 | local ext=".orig.${dt}" 181 | local toc_path="${gh_src}.toc.${dt}" 182 | local toc_createdby="" 183 | local toc_footer="" 184 | # http://fahdshariff.blogspot.ru/2012/12/sed-mutli-line-replacement-between-two.html 185 | # clear old TOC 186 | sed -i${ext} "/${ts}/,/${te}/{//!d;}" "$gh_src" 187 | # create toc file 188 | echo "${toc}" > "${toc_path}" 189 | if [ "${no_footer}" != "yes" ]; then 190 | echo -e "\n${toc_createdby}\n${toc_footer}\n" >> "$toc_path" 191 | fi 192 | 193 
| # insert toc file 194 | if ! sed --version > /dev/null 2>&1; then 195 | sed -i "" "/${ts}/r ${toc_path}" "$gh_src" 196 | else 197 | sed -i "/${ts}/r ${toc_path}" "$gh_src" 198 | fi 199 | echo 200 | if [ "${no_backup}" = "yes" ]; then 201 | rm ${toc_path} ${gh_src}${ext} 202 | fi 203 | echo "!! TOC was added into: '$gh_src'" 204 | if [ -z "${no_backup}" ]; then 205 | echo "!! Origin version of the file: '${gh_src}${ext}'" 206 | echo "!! TOC added into a separate file: '${toc_path}'" 207 | fi 208 | echo 209 | fi 210 | fi 211 | } 212 | 213 | # 214 | # Grabber of the TOC from rendered html 215 | # 216 | # $1 - a source url of document. 217 | # It's need if TOC is generated for multiple documents. 218 | # $2 - number of spaces used to indent. 219 | # 220 | gh_toc_grab() { 221 | common_awk_script=' 222 | modified_href = "" 223 | split(href, chars, "") 224 | for (i=1;i <= length(href); i++) { 225 | c = chars[i] 226 | res = "" 227 | if (c == "+") { 228 | res = " " 229 | } else { 230 | if (c == "%") { 231 | res = "\\x" 232 | } else { 233 | res = c "" 234 | } 235 | } 236 | modified_href = modified_href res 237 | } 238 | print sprintf("%*s", (level-1)*'"$2"', "") "* [" text "](" gh_url modified_href ")" 239 | ' 240 | if [ `uname -s` == "OS/390" ]; then 241 | grepcmd="pcregrep -o" 242 | echoargs="" 243 | awkscript='{ 244 | level = substr($0, length($0), 1) 245 | text = substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5) 246 | href = substr($0, match($0, "href=\"([^\"]+)?\"")+6, RLENGTH-7) 247 | '"$common_awk_script"' 248 | }' 249 | else 250 | grepcmd="grep -Eo" 251 | echoargs="-e" 252 | awkscript='{ 253 | level = substr($0, length($0), 1) 254 | text = substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5) 255 | href = substr($0, match($0, "href=\"[^\"]+?\"")+6, RLENGTH-7) 256 | '"$common_awk_script"' 257 | }' 258 | fi 259 | href_regex='href=\"[^\"]+?\"' 260 | 261 | # if closedfoo1
264 | #
265 | # became: The command foo1
266 | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n<\/h/<\/h/g' |
267 |
268 | # find strings that corresponds to template
269 | $grepcmd '