├── .gitignore ├── CODE_Of_CONDUCT.md ├── CONTRIBUTING.md ├── DSA-master └── Day-1 │ ├── pascalTraingleRecursive.py │ └── setMatrixZero.py ├── DataScience ├── GeneralMLPrep.md ├── LLMPrep.md ├── MlAlgoCheatSheet.md ├── MlAlgoKeyFormulae.md └── study_plan.md ├── GenerativeAI ├── 1.1.What does generative truly mean.md ├── 1.2 Next Word Prediction.md ├── 1.3 Embedding Process. - Mathematical Intution.md ├── 1.4 Attention Block - Python Example.md ├── 1.5 MLP Block - Python Example.md ├── 1.6 Positional Encoding - Python Example.md ├── 1.7 End to End process of Attention.md ├── 2. How does FAISS work.md ├── 3. FAISS Advanced explaination.md ├── 4. Transformers and Vector DB Interview prep.md ├── 5. FAISS interview prep.md ├── 6.RecursiveReferenceRAG.md └── References.md ├── Interview_Questions.md ├── LICENSE ├── Python_Programming_Quiz.md ├── README.md ├── adjacentElementProduct.py ├── atoi.py ├── binary_search_recursive.py ├── bits_wilp ├── Ex2_Numpy_Q1.ipynb ├── Ex2_Numpy_Q2.ipynb ├── Ex2_Numpy_Q3.ipynb ├── Ex2_Numpy_Q4.ipynb ├── Ex2_Numpy_Q5.ipynb ├── Ex2_Numpy_Q6.ipynb ├── Ex2_Pandas_DataViz.ipynb ├── Quiz 1_ S2-20_DSECLPFDS.pdf ├── binomialCoefficient.py ├── calculateFrequency.py ├── isAngstrom.py ├── isPalindrome.py ├── practice.py ├── primeFactorization.py ├── sample.txt ├── searching.py ├── sumOfDigits.py └── topThreeFrequent.py ├── bresenham_line_algorithm.py ├── bst_nodes_in_range.py ├── bubble_sort.py ├── calculateClockAngle.py ├── check_anagrams.py ├── check_semiprime.py ├── data_science_interviews.md ├── dfs_bfs.py ├── diameterOfTree.py ├── estimate_pi.py ├── find_k_largest.py ├── find_m_to_last_llist.py ├── find_pairs_sum_k.py ├── find_products_pair_k.py ├── find_pythagoras_triplet.py ├── find_second_largest_in_binary_tree.py ├── first_n_fibo.py ├── first_non_repeating.py ├── first_recurring_character.py ├── first_unique_letter.py ├── gamblers_ruin.py ├── gen_largest_num_frm_list.py ├── general_tree_structure.py ├── getMinPlatforms.py ├── get_dup_chars.py ├── hasZeroSumSubArray.py ├── has_only_digits.py ├── haversine.py ├── heap_structure.py ├── hundred_without_int.py ├── interger_to_roman_num.py ├── intersection_arrays.py ├── isMatrixSymmetric.py ├── is_anagram.py ├── is_anagram_using_collections.py ├── is_num_palindrome.py ├── is_numeric.py ├── josephus.py ├── josephus_improved.py ├── josephus_improved_v3.py ├── karatsuba.py ├── level_order_tree.py ├── linked_list_data_structure.py ├── loop_in_linkedlist.py ├── lowest_common_ancestor.py ├── majority_element.py ├── max_in_array.py ├── maximum_subarray_sum.py ├── merge_sort.py ├── min_max_array_oneLoop.py ├── move_zeros_to_end.py ├── no_sibling_tree.py ├── oddAscEvenDesc.py ├── pascal_triangle.py ├── pascals_triangle_improved.py ├── permutations.py ├── permute_strings.py ├── preorder_iterative_bst.py ├── priority_queue_simple.py ├── processStringToDict.py ├── product_puzzle.py ├── queue_data_structure.py ├── quick_sort.py ├── range_fn_float.py ├── remove_chars.py ├── remove_dup_chars.py ├── remove_duplicates.py ├── remove_duplicates_v2.py ├── reverse_in_place.py ├── reverse_str_recursive.py ├── reverse_words.py ├── rotateMatrix180Deg.py ├── rotate_matrix.py ├── running_median_integers.py ├── search_unique.py ├── selection_sort.py ├── signOfProduct.py ├── stack_data_structure.py ├── stock_span.py ├── sum_array_recursion.py ├── timeseries.py ├── union_arrays.py └── username_validation.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | 
__pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /CODE_Of_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at singhal.amogh1995@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct, please follow it in all your interactions with the project. 7 | 8 | ## Pull Request Process 9 | 10 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 11 | build. 12 | 2. Update the README.md with details of changes to the interface, this includes new environment 13 | variables, exposed ports, useful file locations and container parameters. 14 | 3. Increase the version numbers in any examples files and the README.md to the new version that this 15 | Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/). 16 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 17 | do not have permission to do that, you may request the second reviewer to merge it for you. 
18 | 19 | ## Code of Conduct 20 | 21 | ### Our Pledge 22 | 23 | In the interest of fostering an open and welcoming environment, we as 24 | contributors and maintainers pledge to making participation in our project and 25 | our community a harassment-free experience for everyone, regardless of age, body 26 | size, disability, ethnicity, gender identity and expression, level of experience, 27 | nationality, personal appearance, race, religion, or sexual identity and 28 | orientation. 29 | 30 | ### Our Standards 31 | 32 | Examples of behavior that contributes to creating a positive environment 33 | include: 34 | 35 | * Using welcoming and inclusive language 36 | * Being respectful of differing viewpoints and experiences 37 | * Gracefully accepting constructive criticism 38 | * Focusing on what is best for the community 39 | * Showing empathy towards other community members 40 | 41 | Examples of unacceptable behavior by participants include: 42 | 43 | * The use of sexualized language or imagery and unwelcome sexual attention or 44 | advances 45 | * Trolling, insulting/derogatory comments, and personal or political attacks 46 | * Public or private harassment 47 | * Publishing others' private information, such as a physical or electronic 48 | address, without explicit permission 49 | * Other conduct which could reasonably be considered inappropriate in a 50 | professional setting 51 | 52 | ### Our Responsibilities 53 | 54 | Project maintainers are responsible for clarifying the standards of acceptable 55 | behavior and are expected to take appropriate and fair corrective action in 56 | response to any instances of unacceptable behavior. 57 | 58 | Project maintainers have the right and responsibility to remove, edit, or 59 | reject comments, commits, code, wiki edits, issues, and other contributions 60 | that are not aligned to this Code of Conduct, or to ban temporarily or 61 | permanently any contributor for other behaviors that they deem inappropriate, 62 | threatening, offensive, or harmful. 63 | 64 | ### Scope 65 | 66 | This Code of Conduct applies both within project spaces and in public spaces 67 | when an individual is representing the project or its community. Examples of 68 | representing a project or community include using an official project e-mail 69 | address, posting via an official social media account, or acting as an appointed 70 | representative at an online or offline event. Representation of a project may be 71 | further defined and clarified by project maintainers. 72 | 73 | ### Enforcement 74 | 75 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 76 | reported by contacting the project team at [INSERT EMAIL ADDRESS]. All 77 | complaints will be reviewed and investigated and will result in a response that 78 | is deemed necessary and appropriate to the circumstances. The project team is 79 | obligated to maintain confidentiality with regard to the reporter of an incident. 80 | Further details of specific enforcement policies may be posted separately. 81 | 82 | Project maintainers who do not follow or enforce the Code of Conduct in good 83 | faith may face temporary or permanent repercussions as determined by other 84 | members of the project's leadership. 
85 | 
86 | ### Attribution
87 | 
88 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
89 | available at [http://contributor-covenant.org/version/1/4][version]
90 | 
91 | [homepage]: http://contributor-covenant.org
92 | [version]: http://contributor-covenant.org/version/1/4/
93 | 
-------------------------------------------------------------------------------- /DSA-master/Day-1/pascalTraingleRecursive.py: --------------------------------------------------------------------------------
1 | def computeCoeff(row, col):
2 |     """
3 |     This method computes the Binomial coefficient for each point in the Pascal Triangle
4 |     """
5 |     if col == 0 or row == col:
6 |         return 1 # for the corners of each row
7 |     else:
8 |         return computeCoeff(row-1, col) + computeCoeff(row-1, col-1) # add the number in the previous row, same column, to the one a column to its left
9 | 
10 | def printTriangle(n):
11 |     """
12 |     This method prints the Pascal triangle with `n` rows
13 |     """
14 |     for r in range(n):
15 |         for c in range(r+1):
16 |             print(computeCoeff(r,c), end=' ')
17 |         print('\n')
18 | 
19 | printTriangle(10)
20 | 
21 | # Output
22 | """
23 | 1 
24 | 
25 | 1 1 
26 | 
27 | 1 2 1 
28 | 
29 | 1 3 3 1 
30 | 
31 | 1 4 6 4 1 
32 | 
33 | 1 5 10 10 5 1 
34 | 
35 | 1 6 15 20 15 6 1 
36 | 
37 | 1 7 21 35 35 21 7 1 
38 | 
39 | 1 8 28 56 70 56 28 8 1 
40 | 
41 | 1 9 36 84 126 126 84 36 9 1 
42 | """
43 | 
-------------------------------------------------------------------------------- /DSA-master/Day-1/setMatrixZero.py: --------------------------------------------------------------------------------
1 | # Set Matrix Zero
2 | # Problem Statement: Given a matrix, if an element in the matrix is 0
3 | # then you will have to set its entire column and row to 0 and then
4 | # return the matrix.
5 | 
6 | # Input: matrix=[[1,1,1],[1,0,1],[1,1,1]]
7 | # Output: [[1,0,1],[0,0,0],[1,0,1]]
8 | 
9 | def getZeros(matrix, shape):
10 |     """
11 |     returns the location of zeros in the matrix
12 |     as a list of tuples
13 |     Time Complexity: O(M*N) where MxN is the shape of the matrix
14 |     """
15 |     r,c = shape
16 |     zeros = []
17 |     for i in range(0,r):
18 |         for j in range(0,c):
19 |             if matrix[i][j] == 0:
20 |                 zeros.append((i,j))
21 |     return zeros
22 | 
23 | def setZeros(matrix, shape, zeros):
24 |     """
25 |     returns the modified matrix
26 |     Time complexity: O(Z*(M+N)) where MxN is the shape of the matrix and Z is the number of zero locations
27 |     """
28 |     r,c = shape
29 | 
30 |     for z in zeros:
31 |         m,n = z
32 |         for i in range(0,r):
33 |             matrix[i][n] = 0
34 | 
35 |         for j in range(0,c):
36 |             matrix[m][j] = 0
37 | 
38 |     return matrix
39 | 
40 | 
41 | 
42 | M = [[0,1,2,0],[3,4,5,2],[1,3,1,5]]
43 | size_mat = (len(M), len(M[0]))
44 | 
45 | print("Original Matrix: ", M)
46 | print("Shape of the matrix: ",size_mat)
47 | 
48 | zeroLocs = getZeros(M, size_mat)
49 | print("Zeros found at:", zeroLocs)
50 | 
51 | M_z = setZeros(M, size_mat, zeroLocs)
52 | print("Modified matrix is: ", M_z)
53 | 
-------------------------------------------------------------------------------- /DataScience/GeneralMLPrep.md: --------------------------------------------------------------------------------
1 | CNN
2 | ==========
3 | * CNNs are deep learning architectures that are primarily used for processing image data.
4 | * The special operation known as convolution helps them extract features like edges and textures, in combination with filters.
5 | * ReLU is applied as an activation function to add non-linearity
6 | * Pooling is performed to reduce the spatial dimensions while retaining important information.
This helps reduce the computational load and control overfitting
7 | * Fully Connected Layer (FCL): after several convolution and pooling operations, the output is passed through an FCL to generate the class probabilities needed for classification.
8 | 
9 | How CNNs Work:
10 | ==========
11 | * The input image is transformed into a numerical representation, where each pixel is assigned a value based on its intensity.
12 | * The convolution operation involves sliding the filter across the image and performing element-wise multiplication, followed by summation to create a feature map.
13 | * As data progresses through multiple layers, CNNs learn increasingly complex features, from simple edges in early layers to intricate shapes in deeper layers.
14 | 
15 | Applications:
16 | ==========
17 | CNNs are widely used in various fields such as:
18 | * Image Recognition: Identifying objects in images (e.g., facial recognition).
19 | * Medical Image Analysis: Analyzing X-rays or MRIs for diagnostic purposes.
20 | * Autonomous Vehicles: Object detection and scene understanding.
21 | 
22 | RNN
23 | ==========
24 | * RNNs are a class of neural networks that are excellent at processing sequential data
25 | * They maintain an internal state: at time step `t`, the input `x(t)` is combined with the hidden state from the previous step `h(t-1)` to produce a new hidden state `h(t)`
26 | * h(t) = f[W(h) * h(t-1) + W(x) * x(t) + b], where W(h) and W(x) are weight matrices, b is the bias term, and f is the activation function
27 | 
28 | Applications:
29 | ==========
30 | RNNs are commonly used in:
31 | * Natural Language Processing: Tasks such as language modeling, text generation, and sentiment analysis.
32 | * Speech Recognition: Processing audio signals to convert speech into text.
33 | * Time Series Prediction: Forecasting stock prices or weather conditions based on historical data.
34 | 
35 | Decision Tree
36 | ==========
37 | * A decision tree is a supervised ML algorithm used in classification and regression tasks
38 | * It models decisions and their possible consequences in the form of a tree-like structure
39 | * A branch represents a `decision rule` and an internal node represents a `feature`. The leaf node, or the terminal node of a branch, is the `outcome`
40 | 
41 | Building a Decision Tree:
42 | ==========
43 | DEINR (pronounced as "Diner"): Data; Entropy; InformationGain; NodeSelection; RecursiveSplitting
44 | * Data Input: Start with the entire dataset.
45 | * Entropy Calculation: Calculate the entropy of the target variable and predictor attributes to measure impurity (see the worked sketch after this section).
46 | * Information Gain: Determine the information gain for each attribute to identify which feature best splits the data.
47 | * Node Selection: Choose the attribute with the highest information gain as the root node.
48 | * Recursive Splitting: Repeat this process recursively for each branch until all branches are finalized or a *stopping criterion is met (e.g., maximum depth or minimum samples per leaf)*
49 | 
50 | Advantages:
51 | ==========
52 | * Easy to interpret and visualize.
53 | * Requires little data preprocessing (no need for normalization).
54 | * Can handle both numerical and categorical data.
55 | 
56 | Disadvantages:
57 | ============
58 | * Prone to overfitting, especially with deep trees.
59 | * Sensitive to small variations in data.
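To make the Entropy Calculation and Information Gain steps concrete, here is a minimal NumPy sketch; the tiny "Play"/"Windy" dataset is invented purely for illustration:

```python
import numpy as np

def entropy(labels):
    # H(D) = -sum_j p_j * log2(p_j), computed from the class frequencies in D
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(labels, mask):
    # Gain = H(parent) - weighted average entropy of the two child nodes
    left, right = labels[mask], labels[~mask]
    weighted = (len(left) * entropy(left) + len(right) * entropy(right)) / len(labels)
    return entropy(labels) - weighted

# Toy data: how well does a binary "Windy" feature split the "Play" outcomes?
play = np.array(["yes", "yes", "yes", "no", "no", "yes", "no", "yes"])
windy = np.array([False, False, True, True, True, False, True, False])

print("Parent entropy:", entropy(play))  # impurity before the split
print("Gain from splitting on Windy:", information_gain(play, windy))
```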
60 | 
61 | Random Forest
62 | ==========
63 | * Random Forest is an ensemble technique that combines multiple decision trees
64 | * It mitigates overfitting by averaging the results of many trees, which individually may have high variance
65 | 
66 | Building a Random Forest:
67 | ==========
68 | BTA (pronounced as "beta"): BootStrapSampling; TreeConstruction; Aggregation
69 | * Bootstrap Sampling: Randomly select subsets of the training data with replacement to create multiple datasets.
70 | * Tree Construction: For each subset, build a decision tree using a random selection of features at each split.
71 | * Aggregation: During prediction, aggregate the results from all trees (e.g., majority vote for classification or average for regression)
72 | 
73 | Advantages:
74 | ==========
75 | * Reduces overfitting compared to individual decision trees.
76 | * Handles large datasets with higher dimensionality well.
77 | * Provides feature importance scores.
78 | 
79 | Disadvantages:
80 | ==========
81 | * More complex and less interpretable than single decision trees.
82 | * Requires more computational resources.
83 | 
84 | Bagging or (B)ootstrap (Agg)regating
85 | ====================================
86 | * This is an ensemble technique aimed at improving the accuracy and stability of ML models
87 | * It is done by combining multiple models trained on different subsets of the training data
88 | 
89 | How Bagging Works:
90 | ===============
91 | * Multiple Samples: Generate multiple bootstrap samples from the original dataset.
92 | * Model Training: Train a separate model (e.g., decision tree) on each bootstrap sample.
93 | * Final Prediction: Aggregate predictions from all models (e.g., majority voting for classification, averaging for regression)
94 | 
95 | Advantages:
96 | ==========
97 | * Reduces variance and helps prevent overfitting.
98 | * Improves model robustness against noise in data.
99 | 
100 | Disadvantages:
101 | =================
102 | * May not significantly improve performance if base learners are not diverse.
103 | 
104 | Boosting
105 | ====================================
106 | * This is an ensemble technique aimed at improving the accuracy and stability of ML models
107 | * It is done by combining weak learners (models that perform slightly better than random chance) to create a strong learner
108 | * The strong learner is built in iterations, with a focus on misclassified instances.
109 | 
110 | How Boosting Works:
111 | ===============
112 | * Sequential Learning: Models are trained sequentially, where each new model focuses on correcting errors made by previous models.
113 | * Weight Adjustment: Misclassified instances are given higher weights so that subsequent models pay more attention to them.
114 | * Final Prediction: Combine predictions from all models, typically using weighted voting or averaging
115 | 
116 | Popular Boosting Algorithms:
117 | ==========
118 | * AdaBoost
119 | * Gradient Boosting
120 | * XGBoost
121 | 
122 | Advantages:
123 | ==========
124 | * Often achieves high accuracy and performs well even with limited data.
125 | * Can handle various types of data and relationships.
126 | 
127 | Disadvantages:
128 | =================
129 | * More prone to overfitting than bagging if not carefully tuned.
130 | * Requires careful tuning of parameters.
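As a quick end-to-end contrast between the two ensemble styles above, here is a minimal sketch assuming scikit-learn is available; the dataset is synthetic and the hyperparameters are illustrative only:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Bagging: full-depth trees trained independently on bootstrap samples
bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=42)

# Boosting: shallow "weak learners" trained sequentially, reweighting mistakes
boosting = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=42)

for name, model in [("Bagging", bagging), ("AdaBoost", boosting)]:
    model.fit(X_train, y_train)
    print(name, "test accuracy:", model.score(X_test, y_test))
```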
131 | -------------------------------------------------------------------------------- /DataScience/LLMPrep.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DataScience/MlAlgoCheatSheet.md: -------------------------------------------------------------------------------- 1 | ## Popular Algorithms in Data Science 2 | 3 | In data science, various algorithms are employed for tasks such as regression and classification. Each algorithm has associated loss functions and performance metrics that help evaluate its effectiveness. Below is a detailed overview of popular algorithms, their loss functions, performance metrics, and caveats for their use. 4 | 5 | ### 1. **Linear Regression** 6 | - **Loss Function:** Mean Squared Error (MSE) 7 | - **Performance Metrics:** R-squared (R²), Adjusted R², Mean Absolute Error (MAE) 8 | - **Caveats:** Sensitive to outliers; performs poorly when the relationship between features and target is non-linear. 9 | 10 | ### 2. **Logistic Regression** 11 | - **Loss Function:** Binary Cross-Entropy Loss (Log Loss) 12 | - **Performance Metrics:** Accuracy, Precision, Recall, F1 Score 13 | - **Caveats:** Assumes linearity between the independent variables and the log odds of the dependent variable; not suitable for multi-class problems without modification. 14 | 15 | ### 3. **Decision Trees** 16 | - **Loss Function:** Gini Impurity (for classification), Mean Squared Error (for regression) 17 | - **Performance Metrics:** Accuracy, Mean Absolute Error (MAE), Root Mean Squared Error (RMSE) 18 | - **Caveats:** Prone to overfitting; sensitive to small changes in data which can lead to different tree structures. 19 | 20 | ### 4. **Support Vector Machines (SVM)** 21 | - **Loss Function:** Hinge Loss (for classification), Epsilon-insensitive Loss (for regression) 22 | - **Performance Metrics:** Accuracy, Precision, Recall 23 | - **Caveats:** Computationally expensive for large datasets; requires careful tuning of hyperparameters like the kernel choice. 24 | 25 | ### 5. **Random Forest** 26 | - **Loss Function:** Mean Squared Error (for regression), Gini Impurity or Cross-Entropy Loss (for classification) 27 | - **Performance Metrics:** Out-of-Bag Error, Accuracy 28 | - **Caveats:** Can be less interpretable than simpler models; may require significant computational resources. 29 | 30 | ### 6. **Gradient Boosting Machines (GBM)** 31 | - **Loss Function:** Log Loss (for classification), Mean Squared Error (for regression) 32 | - **Performance Metrics:** Log-Likelihood, RMSE 33 | - **Caveats:** Sensitive to overfitting if not properly regularized; requires careful tuning of learning rate and tree depth. 34 | 35 | ### 7. **Neural Networks** 36 | - **Loss Function:** Cross-Entropy Loss (for classification), Mean Squared Error (for regression) 37 | - **Performance Metrics:** Accuracy, F1 Score, Area Under Curve (AUC) 38 | - **Caveats:** Requires large amounts of data; can be prone to overfitting if not regularized properly; less interpretable compared to traditional models. 39 | 40 | ### 8. **K-Means Clustering** 41 | - **Loss Function:** Sum of Squared Errors (SSE) 42 | - **Performance Metrics:** Silhouette Score, Davies-Bouldin Index 43 | - **Caveats:** Assumes spherical clusters; sensitive to initial centroid placement; requires specifying the number of clusters in advance. 
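To ground a couple of the loss functions listed above, here is a small NumPy sketch computing MSE and binary cross-entropy by hand; the targets and predictions are toy numbers invented for illustration:

```python
import numpy as np

# Regression: Mean Squared Error
y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])
mse = np.mean((y_true - y_pred) ** 2)

# Classification: Binary Cross-Entropy (Log Loss)
labels = np.array([1, 0, 1, 1])
probs = np.array([0.9, 0.2, 0.7, 0.6])
eps = 1e-12  # guard against log(0)
log_loss = -np.mean(labels * np.log(probs + eps) + (1 - labels) * np.log(1 - probs + eps))

print("MSE:", mse)  # 0.375
print("Log loss:", log_loss)
```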
44 | 
45 | ## Summary of Loss Functions and Performance Metrics
46 | 
47 | | Algorithm | Loss Function | Performance Metrics |
48 | |------------------------|------------------------------------|----------------------------------------|
49 | | Linear Regression | Mean Squared Error | R², MAE |
50 | | Logistic Regression | Binary Cross-Entropy | Accuracy, F1 Score |
51 | | Decision Trees | Gini Impurity / MSE | Accuracy, MAE |
52 | | Support Vector Machines | Hinge Loss | Accuracy, Precision |
53 | | Random Forest | MSE / Gini Impurity | Out-of-Bag Error |
54 | | Gradient Boosting | Log Loss / MSE | RMSE |
55 | | Neural Networks | Cross-Entropy / MSE | Accuracy, AUC |
56 | | K-Means Clustering | Sum of Squared Errors | Silhouette Score |
57 | 
58 | ## Conclusion
59 | 
60 | The choice of algorithm depends on the specific characteristics of the dataset and the nature of the problem being solved. Understanding the strengths and weaknesses of each algorithm helps in selecting the most appropriate one for a given task. For instance, while linear regression is simple and interpretable, it may not capture complex relationships in the data. Conversely, neural networks can model intricate patterns but require more data and computational power.
61 | 
71 | -------------------------------------------------------------------------------- /DataScience/MlAlgoKeyFormulae.md: -------------------------------------------------------------------------------- 1 | ## Popular Algorithms in Data Science with Mathematical Formulations 2 | 3 | Here is an expanded overview of popular algorithms in data science, including their mathematical formulations, loss functions, performance metrics, and caveats. 4 | 5 | ### 1. **Linear Regression** 6 | - **Mathematical Formula:** 7 | $$ y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + ... + \beta_n x_n + \epsilon $$ 8 | where $$y$$ is the dependent variable, $$x_i$$ are independent variables, $$\beta_i$$ are coefficients, and $$\epsilon$$ is the error term. 9 | - **Loss Function:** Mean Squared Error (MSE) 10 | $$ MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 $$ 11 | - **Performance Metrics:** R-squared (R²), Adjusted R², Mean Absolute Error (MAE) 12 | - **Caveats:** Sensitive to outliers; performs poorly with non-linear relationships. 13 | 14 | ### 2. **Logistic Regression** 15 | - **Mathematical Formula:** 16 | $$ P(Y=1|X) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + ... + \beta_n x_n)}} $$ 17 | - **Loss Function:** Binary Cross-Entropy Loss (Log Loss) 18 | $$ L = -\frac{1}{n} \sum_{i=1}^{n} [y_i \log(\hat{y}_i) + (1-y_i) \log(1-\hat{y}_i)] $$ 19 | - **Performance Metrics:** Accuracy, Precision, Recall, F1 Score 20 | - **Caveats:** Assumes linearity in log odds; not suitable for multi-class without modification. 21 | 22 | ### 3. **Decision Trees** 23 | - **Mathematical Formula:** 24 | - For classification using Gini Impurity: 25 | $$ Gini(D) = 1 - \sum_{j=1}^{C} p_j^2 $$ 26 | where $$p_j$$ is the proportion of class $$j$$ in dataset $$D$$. 27 | - For regression: 28 | $$ MSE(D) = \frac{1}{|D|} \sum_{i=1}^{|D|} (y_i - \bar{y})^2 $$ 29 | where $$y_i$$ are the actual values and $$\bar{y}$$ is the mean of $$y$$. 30 | - **Loss Function:** Gini Impurity or Mean Squared Error 31 | - **Performance Metrics:** Accuracy, MAE 32 | - **Caveats:** Prone to overfitting; sensitive to data changes. 33 | 34 | ### 4. **Support Vector Machines (SVM)** 35 | - **Mathematical Formula:** 36 | $$ f(x) = w^T x + b $$ 37 | where $$w$$ is the weight vector and $$b$$ is the bias. 38 | - **Loss Function:** Hinge Loss 39 | $$ L(y, f(x)) = \max(0, 1 - y f(x)) $$ 40 | - **Performance Metrics:** Accuracy, Precision, Recall 41 | - **Caveats:** Computationally expensive for large datasets; requires careful tuning of hyperparameters. 42 | 43 | ### 5. **Random Forest** 44 | - **Mathematical Formula:** 45 | The prediction is made by averaging the predictions from multiple decision trees: 46 | $$ \hat{y} = \frac{1}{N} \sum_{i=1}^{N} T_i(x) $$ 47 | where $$T_i$$ are individual trees. 48 | - **Loss Function:** Mean Squared Error or Gini Impurity 49 | - **Performance Metrics:** Out-of-Bag Error, Accuracy 50 | - **Caveats:** Less interpretable than single trees; requires significant computational resources. 51 | 52 | ### 6. **Gradient Boosting Machines (GBM)** 53 | - **Mathematical Formula:** 54 | $$ F(x) = F_{m-1}(x) + \gamma_m h_m(x) $$ 55 | where $$h_m(x)$$ is the new tree added at iteration $$m$$. 56 | - **Loss Function:** Log Loss or Mean Squared Error 57 | - **Performance Metrics:** RMSE 58 | - **Caveats:** Sensitive to overfitting; requires careful tuning of learning rate and tree depth. 59 | 60 | ### 7. 
**Neural Networks**
61 | - **Mathematical Formula:**
62 | $$ y = f(WX + b) $$
63 | where $$W$$ are weights, $$X$$ is input data, and $$b$$ is bias.
64 | - **Loss Function:** Cross-Entropy Loss or Mean Squared Error
65 | - Cross-Entropy for classification:
66 | $$ L = -\frac{1}{n} \sum_{i=1}^{n} [y_i \log(\hat{y}_i)] $$
67 | - **Performance Metrics:** Accuracy, F1 Score, AUC
68 | - **Caveats:** Requires large amounts of data; less interpretable than traditional models.
69 | 
70 | ### 8. **K-Means Clustering**
71 | - **Mathematical Formula:**
72 | $$ J = \sum_{i=1}^{k} \sum_{j=1}^{n} ||x_j^{(i)} - c_i||^2 $$
73 | where $$c_i$$ are centroids and $$x_j^{(i)}$$ are data points assigned to cluster $$i$$.
74 | - **Loss Function:** Sum of Squared Errors (SSE)
75 | - **Performance Metrics:** Silhouette Score, Davies-Bouldin Index
76 | - **Caveats:** Assumes spherical clusters; sensitive to initial centroid placement.
77 | 
78 | ## Summary of Formulations
79 | 
80 | | Algorithm | Mathematical Formula | Loss Function | Performance Metrics |
81 | |------------------------|-------------------------------------------------------------------------------------|------------------------------------|----------------------------------------|
82 | | Linear Regression | $$ y = \beta_0 + \beta_1 x_1 + ... + \beta_n x_n + \epsilon $$ | MSE | R², MAE |
83 | | Logistic Regression | $$ P(Y=1 \mid X) = \frac{1}{1 + e^{-(\beta_0 + ... + \beta_n x_n)}} $$ | Binary Cross-Entropy | Accuracy, F1 Score |
84 | | Decision Trees | Gini: $$ Gini(D) = 1 - \sum p_j^2 $$ | Gini Impurity / MSE | Accuracy, MAE |
85 | | Support Vector Machines | $$ f(x) = w^T x + b $$ | Hinge Loss | Accuracy, Precision |
86 | | Random Forest | $$ \hat{y} = \frac{1}{N} \sum T_i(x) $$ | MSE / Gini Impurity | Out-of-Bag Error |
87 | | Gradient Boosting | $$ F(x) = F_{m-1}(x) + \gamma_m h_m(x) $$ | Log Loss / MSE | RMSE |
88 | | Neural Networks | $$ y = f(WX + b) $$ | Cross-Entropy / MSE | Accuracy, AUC |
89 | | K-Means Clustering | $$ J = \sum \lVert x_j^{(i)} - c_i \rVert^2 $$ | SSE | Silhouette Score |
90 | 
91 | This comprehensive overview provides insights into each algorithm's mathematical foundation along with its practical applications and limitations. Understanding these aspects can help in selecting the right algorithm for specific data science tasks.
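To see the K-Means objective $$J$$ above in action, here is a minimal NumPy sketch of the assignment and update steps; the two 2-D blobs are synthetic data invented for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(12, 2)) + np.repeat([[0.0, 0.0], [5.0, 5.0]], 6, axis=0)  # two loose blobs
c = X[rng.choice(len(X), size=2, replace=False)]  # initialize centroids at two random points

for _ in range(5):
    # Assignment step: each point joins its nearest centroid
    d = np.linalg.norm(X[:, None, :] - c[None, :, :], axis=2)  # (12, 2) distance matrix
    assign = d.argmin(axis=1)
    # Update step: each centroid moves to the mean of its assigned points
    # (keeping the old centroid if a cluster happens to be empty)
    c = np.array([X[assign == k].mean(axis=0) if np.any(assign == k) else c[k] for k in range(2)])

J = sum(np.sum((X[assign == k] - c[k]) ** 2) for k in range(2))  # the SSE objective
print("Centroids:\n", c)
print("Objective J:", J)
```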
92 | 
-------------------------------------------------------------------------------- /DataScience/study_plan.md: --------------------------------------------------------------------------------
1 | # Statistics
2 | * T test
3 | * Z test
4 | * ANOVA
5 | * Chi Square
6 | * Correlation
7 | * Covariance
8 | * Hypothesis Testing
9 | 
10 | # Classic ML
11 | * Linear Regression
12 | * Logistic Regression
13 | * Regularization (Ridge and Lasso)
14 | * Cost Functions
15 | * Decision Tree
16 | * Random Forest
17 | * Ensemble Learning
18 | * Bagging and Boosting
19 | * XGBoost
20 | * LightGBM
21 | 
22 | # Hyperparameter Tuning
23 | * Grid Search
24 | * Random Search
25 | * HyperOpt
26 | * Feature Selection - PCA
27 | 
28 | # Normalization
29 | * Imbalanced Dataset
30 | * Imputing Missing Data
31 | * Handling Outliers
32 | * Cross Validation
33 | 
34 | # Clustering
35 | * K-Means Clustering
36 | * KNN
37 | * Principal Component Analysis
38 | 
39 | # Performance Measures
40 | * R-square
41 | * Adjusted R-square
42 | * Mean Square Error
43 | * Root Mean Square Error
44 | * MAPE
45 | * Mean Absolute Error
46 | 
47 | * Recall
48 | * Precision
49 | * Accuracy
50 | * F1-Score
51 | * ROC-AUC
52 | * Confusion Matrix
53 | 
54 | * Type 1 Error
55 | * Type 2 Error
56 | * True Positive Rate
57 | * False Positive Rate
58 | 
59 | 
60 | # Advanced ML
61 | * CNN
62 | * R-CNN
63 | * LSTM
64 | * Transformers
65 | * BERT
66 | 
67 | 
68 | # Time Series
69 | * Trend
70 | * Seasonality
71 | * Irregularity
72 | * Cyclicity
73 | * Stationarity
74 | * ADF
75 | * Making data stationary
76 | * White Noise
77 | * Holt-Winters
78 | * FB-Prophet
79 | 
80 | 
81 | # Drift Detection
82 | * Types of drift
83 | * KS Test
84 | * KL Divergence
85 | * Wasserstein distance
86 | * ADWIN
87 | 
88 | # NLP
89 | * Stemming
90 | * Lemmatization
91 | * TF-IDF
92 | * Word2Vec
93 | * Bag of Words models
94 | * spaCy
95 | 
96 | # MLOps
97 | * MLflow
98 | * Model Registry
99 | * Data Versioning
100 | * Artifacts
101 | 
102 | 
103 | 
104 | 
-------------------------------------------------------------------------------- /GenerativeAI/1.1.What does generative truly mean.md: --------------------------------------------------------------------------------
1 | In the context of deep learning, **generative** refers to models that are capable of generating new data samples that are similar to the training data they were trained on. These models learn the underlying probability distribution of the training data and use it to create novel samples[1][2].
2 | 
3 | The key principles behind generative deep learning models are:
4 | 
5 | ## Learning the Data Distribution
6 | 
7 | Generative models learn the probability distribution of the training data.
This allows them to generate new samples that are statistically similar to the original data[2]. 8 | 9 | ## Sampling from the Learned Distribution 10 | 11 | Once the model has learned the data distribution, it can sample from this distribution to generate new samples. This sampling process introduces randomness, which allows the model to produce varied outputs for the same input[1]. 12 | 13 | ## Adversarial Training (GANs) 14 | 15 | One popular type of generative model is the Generative Adversarial Network (GAN). GANs consist of two neural networks - a generator and a discriminator. The generator generates new samples, while the discriminator tries to distinguish between real and generated samples. Through this adversarial training process, the generator learns to produce more realistic samples that can fool the discriminator[2]. 16 | 17 | ## Variational Autoencoders (VAEs) 18 | 19 | Another important class of generative models are Variational Autoencoders (VAEs). VAEs learn a latent representation of the data and use this representation to generate new samples. They are trained to maximize the likelihood of the training data under the learned generative model[3]. 20 | 21 | In summary, generative deep learning models learn the underlying probability distribution of the training data and use this knowledge to generate novel samples that are statistically similar to the original data. This allows them to create impressive outputs like realistic images, coherent text, and plausible audio[3][4][5]. 22 | 23 | Citations: 24 | [1] https://www.cmu.edu/intelligentbusiness/expertise/genai-principles.pdf 25 | [2] https://www.sixsigmacertificationcourse.com/the-basic-principles-of-generative-models-with-an-example/ 26 | [3] https://www.shroffpublishers.com/books/9789355429988/ 27 | [4] https://www.amazon.in/Generative-Deep-Learning-David-Foster-ebook/dp/B0C3WVJWBF 28 | [5] https://www.amazon.in/Deep-Learning-Scratch-Building-Principles/dp/935213902X 29 | -------------------------------------------------------------------------------- /GenerativeAI/1.2 Next Word Prediction.md: -------------------------------------------------------------------------------- 1 | Next word prediction is a fundamental task in Natural Language Processing (NLP) that involves predicting the most likely word to follow a given sequence of words. This task has evolved significantly with the advent of deep learning models, particularly the Transformer architecture, which has transformed the landscape of NLP. 2 | 3 | ## Evolution of Next Word Prediction Models 4 | 5 | ### Early Models: RNNs, LSTMs, and GRUs 6 | 7 | Before the introduction of Transformers, next word prediction was primarily handled by Recurrent Neural Networks (RNNs) and their variants, such as Long Short-Term Memory (LSTM) networks and Gated Recurrent Units (GRU). 8 | 9 | - **RNNs** maintain hidden states that capture information from previous inputs, allowing them to process sequences of data. However, they often struggle with long-range dependencies due to issues like the vanishing gradient problem. 10 | 11 | - **LSTMs** were designed to overcome these limitations by introducing memory cells that can store and retrieve information over longer sequences, making them effective for capturing long-term dependencies. 12 | 13 | - **GRUs** simplify the LSTM architecture by merging the cell state and hidden state, providing a more computationally efficient alternative while still managing to capture long-range dependencies effectively[1]. 
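To make the recurrence these models share concrete, here is a minimal NumPy sketch of a single vanilla-RNN step updating the hidden state; the dimensions and random weights are invented purely for illustration:

```python
import numpy as np

rng = np.random.default_rng(1)
D_x, D_h = 3, 4  # input and hidden-state sizes (illustrative)
W_h = rng.normal(size=(D_h, D_h))  # hidden-to-hidden weights
W_x = rng.normal(size=(D_h, D_x))  # input-to-hidden weights
b = np.zeros(D_h)

def rnn_step(h_prev, x_t):
    # h(t) = tanh(W_h @ h(t-1) + W_x @ x(t) + b)
    return np.tanh(W_h @ h_prev + W_x @ x_t + b)

h = np.zeros(D_h)  # initial hidden state
for x_t in rng.normal(size=(5, D_x)):  # a toy sequence of 5 input vectors
    h = rnn_step(h, x_t)  # the state carries information forward step by step
print("Final hidden state:", h)
```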
14 | 15 | These models laid the groundwork for understanding sequential data and context in language, but they were limited by their sequential processing nature, which hindered parallelization and scalability. 16 | 17 | ## The Transformer Architecture 18 | 19 | Introduced in the groundbreaking paper "Attention Is All You Need" by Vaswani et al. in 2017, the Transformer model revolutionized next word prediction by eliminating the recurrence mechanism entirely. Instead, it relies on a self-attention mechanism that allows it to process all words in a sequence simultaneously, capturing relationships between words regardless of their distance from each other in the text. 20 | 21 | ### Key Components of Transformers 22 | 23 | 1. **Self-Attention Mechanism**: This mechanism allows the model to weigh the importance of different words in the input sequence when making predictions. Each word can attend to all other words, enabling the model to capture complex dependencies and contextual relationships effectively. 24 | 25 | 2. **Positional Encoding**: Since Transformers do not process sequences in order, they use positional encodings to retain information about the position of words within the sequence. This helps the model understand the order of words, which is crucial for language comprehension. 26 | 27 | 3. **Encoder-Decoder Structure**: The Transformer consists of an encoder that processes the input sequence and a decoder that generates the output sequence. Each encoder and decoder layer employs self-attention and feed-forward networks, allowing for efficient learning of language patterns[2][3]. 28 | 29 | ### Advantages of Transformers 30 | 31 | Transformers offer several advantages over previous models: 32 | 33 | - **Parallelization**: Unlike RNNs, which process inputs sequentially, Transformers can process entire sequences simultaneously, significantly speeding up training. 34 | 35 | - **Long-Range Dependencies**: The self-attention mechanism enables better handling of long-range dependencies, allowing the model to consider the entire context when predicting the next word. 36 | 37 | - **Scalability**: Transformers can be scaled up easily, leading to the development of large language models (LLMs) like GPT-3 and BERT, which have demonstrated remarkable performance across various NLP tasks, including next word prediction[4][5]. 38 | 39 | ## Conclusion 40 | 41 | The transition from RNNs and their variants to the Transformer architecture marks a significant advancement in next word prediction capabilities. Transformers have not only improved the efficiency and accuracy of predictions but have also paved the way for the development of sophisticated language models that can understand and generate human-like text. This evolution underscores the importance of architectural innovations in enhancing the performance of NLP applications. 
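For contrast with these learned architectures, the simplest statistical approach to the task is a count-based n-gram model; here is a toy bigram sketch (the corpus is invented for illustration) that turns transition counts into a next-word probability distribution:

```python
from collections import Counter, defaultdict

corpus = "life is short life is sweet life is good".split()

# Count bigram transitions: word -> Counter of the words that follow it
transitions = defaultdict(Counter)
for prev, nxt in zip(corpus, corpus[1:]):
    transitions[prev][nxt] += 1

def next_word_distribution(word):
    counts = transitions[word]
    total = sum(counts.values())
    return {w: c / total for w, c in counts.items()}

print(next_word_distribution("is"))  # each of 'short', 'sweet', 'good' gets 1/3
```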
42 | 43 | Citations: 44 | [1] https://www.geeksforgeeks.org/next-word-prediction-with-deep-learning-in-nlp/ 45 | [2] https://datasciencedojo.com/blog/transformer-models/ 46 | [3] https://en.wikipedia.org/wiki/Transformer_%28machine_learning_model%29 47 | [4] https://www.leewayhertz.com/decision-transformer/ 48 | [5] https://towardsdatascience.com/transformers-141e32e69591 49 | [6] https://www.datacamp.com/tutorial/how-transformers-work 50 | [7] https://www.geeksforgeeks.org/getting-started-with-transformers/ 51 | [8] https://www.techscience.com/cmc/v78n3/55891/html 52 | -------------------------------------------------------------------------------- /GenerativeAI/1.3 Embedding Process. - Mathematical Intution.md: -------------------------------------------------------------------------------- 1 | ## The Intuition Behind Embeddings in Transformers 2 | 3 | Embeddings are a fundamental component of Transformer models, allowing them to represent words and tokens as numerical vectors that can be processed by neural networks. The embedding process maps discrete tokens (like words) into a continuous vector space, where similar tokens are positioned close together. This embedding space captures semantic and syntactic relationships between tokens. 4 | 5 | Some key characteristics of embeddings that make them useful for Transformers: 6 | 7 | ### Continuous Representation 8 | Embeddings represent tokens as continuous vectors, rather than discrete indices. This allows the model to learn smooth relationships between tokens, enabling better generalization. 9 | 10 | ### Dimensionality Reduction 11 | High-dimensional one-hot encoded token representations are mapped to a much lower dimensional embedding space (e.g. 300 dimensions). This dimensionality reduction allows the model to efficiently process and store token representations. 12 | 13 | ### Semantic Relationships 14 | The embedding space encodes semantic relationships between tokens. For example, the vector for "king" - "man" + "woman" points to the vector for "queen"[1]. These relationships emerge from the training data. 15 | 16 | ### Parallelization 17 | Embeddings allow the model to process all tokens in parallel, rather than sequentially. This is important for the self-attention mechanism in Transformers, which computes relationships between all pairs of tokens[3]. 18 | 19 | ### Transfer Learning 20 | Pre-trained embeddings, like those from BERT, can be fine-tuned on specific tasks. The embeddings capture general language knowledge that can be leveraged for various applications[5]. 21 | 22 | ### Intuitive Visualization 23 | Embeddings can be visualized in 2D or 3D space to gain intuitions about the model's internal representations. Semantically similar tokens cluster together in the embedding space[4]. 24 | 25 | Mathematically, an embedding space is a manifold in which similar items are positioned closer to one another than dissimilar items[6]. The embedding process maps discrete tokens to points on this manifold, preserving semantic relationships. Transformers leverage these properties of embeddings to efficiently process and reason about language. 
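To make the "king" - "man" + "woman" relationship above concrete, here is a minimal NumPy sketch; the 4-dimensional vectors are hand-crafted toys (real models learn embeddings with hundreds of dimensions from data):

```python
import numpy as np

emb = {
    "king":  np.array([0.9, 0.8, 0.1, 0.2]),
    "man":   np.array([0.5, 0.1, 0.0, 0.3]),
    "woman": np.array([0.5, 0.1, 0.9, 0.3]),
    "queen": np.array([0.9, 0.8, 1.0, 0.2]),
}

def cosine(u, v):
    # Similarity of direction in the embedding space, in [-1, 1]
    return u @ v / (np.linalg.norm(u) * np.linalg.norm(v))

# king - man + woman should land closest to queen
target = emb["king"] - emb["man"] + emb["woman"]
for word, vec in emb.items():
    print(f"{word:>5}: {cosine(target, vec):.3f}")
```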
26 | 27 | Citations: 28 | [1] https://towardsdatascience.com/analyzing-transformers-in-embedding-space-explained-ef72130a6844?gi=ecd132be68ed 29 | [2] https://news.ycombinator.com/item?id=40497379 30 | [3] https://towardsdatascience.com/transformers-intuitively-and-exhaustively-explained-58a5c5df8dbb 31 | [4] https://encord.com/blog/embeddings-machine-learning/ 32 | [5] https://www.datacamp.com/tutorial/how-transformers-work 33 | [6] https://stackoverflow.blog/2023/11/09/an-intuitive-introduction-to-text-embeddings/ 34 | [7] https://www.ibm.com/think/topics/vector-embedding 35 | [8] https://www.geeksforgeeks.org/word-embeddings-in-nlp/ 36 | -------------------------------------------------------------------------------- /GenerativeAI/1.4 Attention Block - Python Example.md: -------------------------------------------------------------------------------- 1 | The attention mechanism in Transformers is a powerful mathematical framework that enables models to focus on different parts of the input sequence, allowing for better understanding of context and relationships within the data. This is particularly useful in tasks such as natural language processing and image recognition. 2 | 3 | ## Mathematical Intuition of Attention Block 4 | 5 | ### Key Concepts 6 | 7 | 1. **Queries, Keys, and Values**: In the context of attention, each input is transformed into three vectors: 8 | - **Query (Q)**: Represents the item for which we want to find relevant information. 9 | - **Key (K)**: Represents the items in the input that can provide information. 10 | - **Value (V)**: Represents the actual information associated with each key. 11 | 12 | 2. **Scaled Dot-Product Attention**: The attention score between queries and keys is computed using the dot product, scaled by the square root of the dimension of the key vectors, followed by a softmax operation to obtain attention weights. The output is then a weighted sum of the value vectors. 13 | 14 | The formula for the attention mechanism can be summarized as: 15 | 16 | $$ 17 | \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V 18 | $$ 19 | 20 | where $$d_k$$ is the dimension of the key vectors. 21 | 22 | 3. **Multi-Head Attention**: Instead of performing a single attention function, multiple attention heads are used. Each head learns different representations by applying the attention mechanism independently and then concatenating their outputs. 23 | 24 | ### End-to-End Process Example 25 | 26 | To illustrate the attention mechanism, we can implement a simple version using Python and NumPy. Below is a step-by-step example. 27 | 28 | ```python 29 | import numpy as np 30 | 31 | # Define input dimensions 32 | d_model = 4 # Dimension of the model 33 | d_k = 2 # Dimension of keys and queries 34 | d_v = 2 # Dimension of values 35 | num_heads = 2 # Number of attention heads 36 | 37 | # Sample input data (3 tokens in the sequence, each represented by a vector of size d_model) 38 | X = np.array([[1, 0, 1, 0], 39 | [0, 1, 0, 1], 40 | [1, 1, 1, 1]]) 41 | 42 | # Randomly initialize weight matrices for queries, keys, and values 43 | W_Q = np.random.rand(d_model, d_k) 44 | W_K = np.random.rand(d_model, d_k) 45 | W_V = np.random.rand(d_model, d_v) 46 | 47 | # Compute queries, keys, and values. @ is the Matrix Multiplication Op. 
48 | Q = X @ W_Q
49 | K = X @ W_K
50 | V = X @ W_V
51 | 
52 | # Compute attention scores
53 | scores = Q @ K.T / np.sqrt(d_k)  # Scale scores
54 | attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)  # Softmax
55 | 
56 | # Compute output
57 | output = attention_weights @ V
58 | 
59 | print("Queries:\n", Q)
60 | print("Keys:\n", K)
61 | print("Values:\n", V)
62 | print("Attention Weights:\n", attention_weights)
63 | print("Output:\n", output)
64 | ```
65 | 
66 | ### Explanation of the Code
67 | 
68 | 1. **Input Data**: We define a simple input matrix `X` representing three tokens, each with a feature vector of size `d_model`.
69 | 
70 | 2. **Weight Matrices**: Random weight matrices `W_Q`, `W_K`, and `W_V` are initialized for transforming the input into queries, keys, and values.
71 | 
72 | 3. **Computing Q, K, V**: The input matrix is multiplied by the corresponding weight matrices to obtain the queries, keys, and values.
73 | 
74 | 4. **Attention Scores**: The dot product of queries and keys is computed, scaled, and passed through a softmax function to obtain attention weights.
75 | 
76 | 5. **Output Calculation**: The final output is computed as a weighted sum of the values based on the attention weights.
77 | 
78 | This example demonstrates the core functionality of the attention mechanism, capturing the relationships between different tokens in the input sequence. The multi-head attention can be implemented similarly by repeating the process for multiple sets of weight matrices and concatenating the results.
79 | 
-------------------------------------------------------------------------------- /GenerativeAI/1.5 MLP Block - Python Example.md: --------------------------------------------------------------------------------
1 | ## Multi-Layer Perceptron (MLP) in Transformers
2 | 
3 | The Multi-Layer Perceptron (MLP) is a key component of the Transformer architecture, responsible for refining the representation of each token using a non-linear transformation. Here's the mathematical intuition behind the MLP in Transformers:
4 | 
5 | ### Mathematical Formulation
6 | 
7 | The MLP in Transformers operates across the features of each token, applying the same non-linear transformation to each token independently. Given the output of the self-attention layer `y(m)_n` for token `n` at layer `m`, the MLP computes:
8 | 
9 | $$
10 | x^{(m+1)}_n = \text{MLP}_\theta(y^{(m)}_n)
11 | $$
12 | 
13 | where `\theta` represents the parameters of the MLP, which are shared across all tokens.
14 | 
15 | The MLP typically consists of one or two hidden layers with a dimension equal to the number of features `D` (or larger). The computational cost of this step is roughly `N * D * D`, where `N` is the sequence length.
16 | 
17 | ### Example Implementation in Python and NumPy
18 | 
19 | Here's a simple example of implementing the MLP component in Transformers using Python and NumPy:
20 | 
21 | ```python
22 | import numpy as np
23 | 
24 | # Define MLP parameters
25 | D = 4  # Number of features
26 | hidden_size = 8  # Size of the hidden layer
27 | 
28 | # Sample input from the self-attention layer
29 | y = np.array([[1, 0, 1, 0],
30 |               [0, 1, 0, 1],
31 |               [1, 1, 1, 1]])
32 | 
33 | # Initialize MLP weights
34 | W1 = np.random.rand(D, hidden_size)
35 | b1 = np.random.rand(1, hidden_size)
36 | W2 = np.random.rand(hidden_size, D)
37 | b2 = np.random.rand(1, D)
38 | 
39 | # Compute MLP output
40 | h = np.maximum(0, y @ W1 + b1)  # ReLU activation in the hidden layer
41 | x = h @ W2 + b2  # Linear output layer
42 | 
43 | print("Input from self-attention layer:\n", y)
44 | print("Output of the MLP:\n", x)
45 | ```
46 | 
47 | In this example:
48 | 
49 | 1. We define the MLP parameters, including the number of features `D` and the size of the hidden layer.
50 | 
51 | 2. We create a sample input `y` from the self-attention layer.
52 | 
53 | 3. We initialize the weights and biases of the MLP randomly.
54 | 
55 | 4. We compute the output of the MLP by applying the following steps:
56 |    - Compute the hidden layer activation using a ReLU non-linearity.
57 |    - Apply the output layer weights and biases to obtain the final output.
58 | 
59 | 5. Finally, we print the input from the self-attention layer and the output of the MLP.
60 | 
61 | The MLP in Transformers acts as a non-linear feature extractor, processing the output of the self-attention layer independently for each token. It helps capture complex interactions between features and refine the representations learned by the self-attention mechanism.
62 | 
-------------------------------------------------------------------------------- /GenerativeAI/1.6 Positional Encoding - Python Example.md: --------------------------------------------------------------------------------
1 | ## Positional Encoding in Transformers
2 | 
3 | Positional encoding is a critical component of the Transformer architecture, designed to provide information about the position of tokens in a sequence. Unlike recurrent neural networks (RNNs), which inherently process sequences in order, Transformers process all tokens in parallel. This parallel processing means that Transformers lack an inherent understanding of the order of tokens, making positional encodings essential.
4 | 
5 | ### Mathematical Intuition
6 | 
7 | The primary goal of positional encoding is to inject information about the position of each token in the input sequence.
The positional encoding for a token at position $$ p $$ in a sequence is defined using sine and cosine functions of varying frequencies, as follows: 8 | 9 | - For even indices: 10 | $$ 11 | PE(p, 2i) = \sin\left(\frac{p}{10000^{2i/d_{\text{model}}}}\right) 12 | $$ 13 | 14 | - For odd indices: 15 | $$ 16 | PE(p, 2i+1) = \cos\left(\frac{p}{10000^{2i/d_{\text{model}}}}\right) 17 | $$ 18 | 19 | Where: 20 | - $$ p $$ is the position of the token in the sequence. 21 | - $$ i $$ indexes the dimension pairs, so dimension $$ 2i $$ receives the sine and dimension $$ 2i+1 $$ the cosine. 22 | - $$ d_{\text{model}} $$ is the total number of dimensions in the embedding. 23 | 24 | This formulation allows each position to have a unique encoding, and the use of sine and cosine functions ensures that the positional encodings can capture relative positions. The geometric progression of frequencies allows the model to learn to attend to relative positions effectively. 25 | 26 | ### End-to-End Process Example 27 | 28 | To illustrate how positional encoding works in practice, we can implement it using Python and NumPy. Below is a step-by-step example. 29 | 30 | ```python 31 | import numpy as np 32 | 33 | def positional_encoding(max_len, d_model): 34 | # Initialize the positional encoding matrix 35 | pos_enc = np.zeros((max_len, d_model)) 36 | 37 | # Compute positional encodings 38 | for p in range(max_len): 39 | for i in range(0, d_model, 2): # i is the even dimension index, i.e. the "2i" in the formula 40 | pos_enc[p, i] = np.sin(p / (10000 ** (i / d_model))) 41 | if i + 1 < d_model: 42 | pos_enc[p, i + 1] = np.cos(p / (10000 ** (i / d_model))) 43 | 44 | return pos_enc 45 | 46 | # Example parameters 47 | max_len = 10 # Maximum length of the input sequence 48 | d_model = 4 # Dimension of the embedding 49 | 50 | # Compute positional encodings 51 | pos_encodings = positional_encoding(max_len, d_model) 52 | 53 | print("Positional Encodings:\n", pos_encodings) 54 | ``` 55 | 56 | ### Explanation of the Code 57 | 58 | 1. **Function Definition**: The `positional_encoding` function takes two parameters: `max_len` (the maximum length of the input sequence) and `d_model` (the dimensionality of the embedding). 59 | 60 | 2. **Matrix Initialization**: A zero matrix `pos_enc` is initialized to store the positional encodings. 61 | 62 | 3. **Computing Encodings**: Two nested loops iterate over each position $$ p $$ and each even dimension index $$ i $$. Since the loop variable is already the even index (the "2i" of the formula), the frequency exponent is simply $$ i/d_{\text{model}} $$: 63 | - For even indices, the sine function is applied. 64 | - For odd indices, the cosine function is applied. 65 | 66 | 4. **Output**: The resulting positional encodings matrix is printed, showing the positional information for each position in the sequence. 67 | 68 | ### Summary 69 | 70 | Positional encoding is essential in the Transformer architecture, allowing the model to incorporate information about the order of tokens in a sequence. By using sine and cosine functions, positional encodings provide unique representations for each position, enabling the model to learn relationships between tokens effectively. This approach enhances the model's ability to process sequences without losing the critical information of token order.
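In a full Transformer, these encodings are not used in isolation: they are added element-wise to the token embeddings before the first attention layer. Here is a minimal sketch of that step, reusing the `positional_encoding` function defined above, with a random matrix standing in for real token embeddings:

```python
import numpy as np

# Toy token embeddings: 10 tokens, each with d_model = 4 features
token_embeddings = np.random.rand(10, 4)

# Inject order information by element-wise addition
model_input = token_embeddings + positional_encoding(10, 4)

print("Embeddings with positional information:\n", model_input)
```

Because the encodings are added rather than concatenated, the embedding dimensionality is unchanged and no extra parameters are introduced.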
71 | 72 | Citations: 73 | [1] https://www.geeksforgeeks.org/positional-encoding-in-transformers/ 74 | [2] https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/ 75 | [3] https://kazemnejad.com/blog/transformer_architecture_positional_encoding/ 76 | [4] https://www.youtube.com/watch?v=kO0XdAsY5YA 77 | [5] https://nlp.seas.harvard.edu/2018/04/03/attention.html 78 | [6] https://www.linkedin.com/pulse/deep-dive-positional-encodings-transformer-neural-network-ajay-taneja 79 | [7] https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html 80 | [8] https://www.youtube.com/watch?v=ZMxVe-HK174 81 | -------------------------------------------------------------------------------- /GenerativeAI/1.7 End to End process of Attention.md: -------------------------------------------------------------------------------- 1 | To illustrate the functioning of the attention block in Transformers, let's break down the process using a sample sentence, its embedding vector, and the attention mechanism step-by-step. We will also show how to compute the probability distribution of the next best word based on the attention scores. 2 | 3 | ### Example Sentence 4 | 5 | Let's take the sentence: **"Life is short"**. 6 | 7 | ### Step 1: Word Embedding 8 | 9 | First, we need to convert the words into embedding vectors. For simplicity, we'll use random embeddings for each word. 10 | 11 | ```python 12 | import numpy as np 13 | 14 | # Define the sentence and create a dictionary for word indices 15 | sentence = "Life is short" 16 | words = sentence.split() 17 | word_to_index = {word: i for i, word in enumerate(words)} 18 | 19 | # Create random embeddings for each word 20 | embedding_dim = 4 # Dimension of the embedding 21 | embeddings = np.random.rand(len(words), embedding_dim) 22 | 23 | print("Word Indices:", word_to_index) 24 | print("Word Embeddings:\n", embeddings) 25 | ``` 26 | 27 | ### Step 2: Compute Queries, Keys, and Values 28 | 29 | In the attention mechanism, we need to compute the queries (Q), keys (K), and values (V) from the embeddings. We will use learned weight matrices for this purpose. 30 | 31 | ```python 32 | # Initialize weight matrices for Q, K, and V 33 | W_Q = np.random.rand(embedding_dim, embedding_dim) 34 | W_K = np.random.rand(embedding_dim, embedding_dim) 35 | W_V = np.random.rand(embedding_dim, embedding_dim) 36 | 37 | # Compute Q, K, V 38 | Q = embeddings @ W_Q 39 | K = embeddings @ W_K 40 | V = embeddings @ W_V 41 | 42 | print("Queries (Q):\n", Q) 43 | print("Keys (K):\n", K) 44 | print("Values (V):\n", V) 45 | ``` 46 | 47 | ### Step 3: Compute Attention Scores 48 | 49 | Next, we calculate the attention scores using the dot product of the queries and keys, followed by a softmax to obtain the attention weights. 50 | 51 | ```python 52 | # Compute attention scores 53 | scores = Q @ K.T / np.sqrt(embedding_dim) # Scale by the square root of the dimension 54 | attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True) # Softmax 55 | 56 | print("Attention Scores:\n", scores) 57 | print("Attention Weights:\n", attention_weights) 58 | ``` 59 | 60 | ### Step 4: Compute Output of the Attention Block 61 | 62 | The output of the attention block is computed as a weighted sum of the values, using the attention weights. 
63 | 64 | ```python 65 | # Compute the output of the attention block 66 | output = attention_weights @ V 67 | 68 | print("Output of Attention Block:\n", output) 69 | ``` 70 | 71 | ### Step 5: Probability Distribution for Next Word 72 | 73 | To predict the next word, we can apply a simple linear layer followed by a softmax function to the output of the attention block. This simulates how we would generate probabilities for the next word in a sequence. 74 | 75 | ```python 76 | # Initialize weights for the output layer 77 | W_out = np.random.rand(embedding_dim, len(words)) 78 | 79 | # Compute logits 80 | logits = output @ W_out 81 | 82 | # Compute probabilities using softmax 83 | probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True) 84 | 85 | print("Logits:\n", logits) 86 | print("Probability Distribution for Next Word:\n", probabilities) 87 | ``` 88 | 89 | ### Summary of the Process 90 | 91 | 1. **Word Embedding**: Convert words into embedding vectors. 92 | 2. **Compute Q, K, V**: Use learned weight matrices to compute queries, keys, and values from the embeddings. 93 | 3. **Attention Scores**: Calculate scores using the dot product of queries and keys, then apply softmax to obtain attention weights. 94 | 4. **Output of Attention Block**: Compute the output as a weighted sum of the values based on the attention weights. 95 | 5. **Next Word Probability**: Generate a probability distribution for the next word using a linear transformation followed by softmax. 96 | 97 | ### Final Output 98 | 99 | The final output will show the probability distribution of the next best word based on the attention mechanism applied to the input sentence. This allows the model to capture the context and relationships between the words effectively. 100 | 101 | Citations: 102 | [1] https://nlp.gluon.ai/examples/sentence_embedding/self_attentive_sentence_embedding.html 103 | [2] https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html 104 | [3] https://datascience.stackexchange.com/questions/95134/how-to-encode-a-sentence-using-an-attention-mechanism 105 | [4] https://towardsdatascience.com/contextual-transformer-embeddings-using-self-attention-explained-with-diagrams-and-python-code-d7a9f0f4d94e?gi=0dee21177e82 106 | [5] https://github.com/gazelle93/Transformer-Various-Positional-Encoding 107 | [6] https://www.linkedin.com/pulse/deep-dive-positional-encodings-transformer-neural-network-ajay-taneja 108 | [7] https://towardsdatascience.com/all-you-need-to-know-about-attention-and-transformers-in-depth-understanding-part-1-552f0b41d021?gi=4b6a109307fe 109 | [8] https://stackoverflow.blog/2023/11/09/an-intuitive-introduction-to-text-embeddings/ 110 | -------------------------------------------------------------------------------- /GenerativeAI/2. How does FAISS work.md: -------------------------------------------------------------------------------- 1 | Here is an explanation of how Facebook AI Similarity Search (FAISS) works, from beginner to advanced, using the example query vector "I like to play football": 2 | 3 | ## Beginner Explanation 4 | 5 | FAISS is a library that allows you to quickly find similar items in a large dataset of vectors. For example, if you have a sentence embedding vector for the query "I like to play football", FAISS can efficiently search through millions or billions of other sentence embedding vectors to find the ones that are most similar. 6 | 7 | To use FAISS, you first need to create an index from your dataset of vectors. 
This involves some preprocessing to optimize the index for fast similarity search. Then, when you have a query vector like "I like to play football", you can pass it to FAISS to search the index and get back the most similar vectors, ranked by similarity score. 8 | 9 | FAISS uses techniques like quantization and efficient distance computation to make the search much faster than a brute force approach of comparing the query to every vector in the dataset one by one. 10 | 11 | ## Intermediate Explanation 12 | 13 | Let's say you have a dataset of 1 billion sentence embedding vectors, and you want to find the 10 most similar vectors to "I like to play football". Here's how FAISS would work: 14 | 15 | 1. **Preprocessing**: FAISS builds an index data structure from the 1 billion vectors. This involves partitioning the vectors into clusters and encoding them using product quantization to reduce memory usage[1][2]. 16 | 17 | 2. **Searching**: When you pass the query vector "I like to play football" to FAISS, it first identifies which clusters the query is closest to. It then only compares the query to the vectors within those clusters, rather than all 1 billion vectors[3]. 18 | 19 | 3. **Ranking**: FAISS computes the similarity scores between the query and the vectors in the relevant clusters. It returns the 10 vectors with the highest scores, which are the most similar to the query[4]. 20 | 21 | FAISS is highly optimized for this process, using techniques like multi-threading and GPU acceleration to make the search extremely fast, even on a dataset of 1 billion vectors[1][5]. 22 | 23 | ## Advanced Explanation 24 | 25 | Under the hood, FAISS uses advanced indexing algorithms to enable efficient similarity search. Some key components are: 26 | 27 | - **Inverted file index (IVF)**: This partitions the vector space into Voronoi cells. For a given query, FAISS first identifies the cells it is closest to, narrowing down the search[3]. 28 | 29 | - **Product quantization (PQ)**: Vectors are decomposed into subvectors which are quantized separately. This allows FAISS to store an approximation of the vectors very compactly in RAM[1][2]. 30 | 31 | - **Hierarchical navigable small world (HNSW) graph**: An efficient nearest neighbor graph structure that allows fast traversal to find similar vectors[4]. 32 | 33 | FAISS provides a range of indexing algorithms that make different time/accuracy/memory tradeoffs. A flat index is exact but slow at billion scale; IVF combined with PQ trades a small amount of accuracy for large gains in speed and memory, and is what FAISS would likely use for a query like "I like to play football" on a large dataset[1][2][3]. 34 | 35 | By leveraging these advanced indexing techniques, FAISS is able to provide state-of-the-art similarity search performance, enabling applications like efficient semantic search, personalized recommendations, and content-based retrieval[4][5]. 36 | 37 | Citations: 38 | [1] https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/ 39 | [2] https://ai.meta.com/tools/faiss/ 40 | [3] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/ 41 | [4] https://www.activeloop.ai/resources/glossary/faiss-facebook-ai-similarity-search/ 42 | [5] https://www.linkedin.com/pulse/exploring-power-facebook-ai-similarity-search-library-venkatesh-mungi-6ncof 43 | -------------------------------------------------------------------------------- /GenerativeAI/3.
FAISS Advanced explaination.md: -------------------------------------------------------------------------------- 1 | To expand on the advanced explanation of Facebook AI Similarity Search (FAISS) and incorporate mathematical expressions, we will delve into the underlying mechanisms and algorithms used in FAISS, using the example query vector "I like to play football". 2 | 3 | ## Advanced Explanation of FAISS 4 | 5 | FAISS is designed for efficient similarity search and clustering of dense vectors, typically in high-dimensional spaces. The core idea is to index a large dataset of vectors so that we can quickly retrieve the most similar vectors to a given query vector. 6 | 7 | ### Key Components of FAISS 8 | 9 | 1. **Vector Representation**: 10 | Each sentence or item is represented as a vector in a high-dimensional space. For example, the sentence "I like to play football" might be encoded into a vector $$\mathbf{q}$$ of dimension $$d$$ (e.g., $$d = 768$$ for sentence embeddings). 11 | 12 | 2. **Distance Metrics**: 13 | FAISS supports various distance metrics for measuring similarity between vectors, including: 14 | 15 | - **L2 (Euclidean) Distance**: 16 | $$ 17 | D(\mathbf{x}, \mathbf{y}) = \sqrt{\sum_{i=1}^{d} (x_i - y_i)^2} 18 | $$ 19 | - **Inner Product** (used for cosine similarity when vectors are normalized): 20 | $$ 21 | D(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^{d} x_i \cdot y_i 22 | $$ 23 | 24 | 3. **Index Structures**: 25 | FAISS employs several indexing strategies to optimize search performance: 26 | 27 | - **Flat Index**: This is the simplest form, where all vectors are stored, and the search is performed using brute force. For a query vector $$\mathbf{q}$$, the search involves calculating the distance to every vector in the index. 28 | 29 | - **Inverted File Index (IVF)**: This partitions the vector space into clusters. Each cluster is represented by a centroid, and vectors are assigned to these clusters. The search process involves: 30 | 1. **Cluster Assignment**: For a query vector $$\mathbf{q}$$, find the nearest centroids using a coarse quantizer (e.g., using L2 distance). 31 | 2. **Refined Search**: Only search within the nearest clusters. 32 | 33 | - **Product Quantization (PQ)**: This technique compresses the vector representation to save memory. It divides each vector into $$M$$ subvectors and quantizes each subvector separately. The distance computation for a query vector $$\mathbf{q}$$ involves: 34 | $$ 35 | D(\mathbf{q}, \mathbf{c}) \approx \sum_{m=1}^{M} D(\mathbf{q}_m, \mathbf{c}_m) 36 | $$ 37 | where $$\mathbf{c}_m$$ is the quantized representation of the $$m^{th}$$ subvector. 38 | 39 | - **Hierarchical Navigable Small World (HNSW)**: This is a graph-based approach that allows for fast nearest neighbor searches. It constructs a multi-layer graph where each layer contains a subset of the vectors, enabling efficient traversal to find nearest neighbors. 40 | 41 | ### Example Search Process 42 | 43 | 1. **Index Creation**: 44 | Suppose we have a dataset of vectors representing various sentences, including our example. We would first create an index: 45 | ```python 46 | import faiss 47 | d = 768 # Example dimension 48 | index = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, nlist=100, M=16, nbits=8) 49 | index.train(training_vectors) # Train the index with a subset of vectors 50 | index.add(vectors) # Add all vectors to the index 51 | ``` 52 | 53 | 2. 
**Query Vector**: 54 | For the query "I like to play football", we encode it into a vector $$\mathbf{q}$$: 55 | ```python 56 | xq = model.encode(["I like to play football"]) 57 | ``` 58 | 59 | 3. **Search Execution**: 60 | To find the 4 nearest neighbors, we execute: 61 | ```python 62 | k = 4 63 | D, I = index.search(xq, k) # D contains distances, I contains indices of neighbors 64 | ``` 65 | 66 | 4. **Distance Calculation**: 67 | The distances $$D$$ are computed using the selected metric (e.g., L2 distance or inner product) based on the indexed vectors. 68 | 69 | 5. **Results Interpretation**: 70 | The indices returned in $$I$$ correspond to the most similar sentences in the dataset. For instance, if the results are: 71 | ``` 72 | I = [4586, 10252, 12465, 190] 73 | ``` 74 | These indices can be used to retrieve the actual sentences that are most similar to the query vector. 75 | 76 | ### Performance Optimization 77 | 78 | FAISS is optimized for performance through various means: 79 | 80 | - **GPU Acceleration**: FAISS can leverage GPU resources to handle large-scale datasets, significantly speeding up both indexing and searching processes. 81 | 82 | - **Parameter Tuning**: Users can adjust parameters such as the number of clusters ($$nlist$$) and the number of probes ($$nprobe$$) to balance between speed and accuracy. For example, increasing $$nprobe$$ allows for more clusters to be searched, potentially improving accuracy at the cost of speed. 83 | 84 | - **Memory Management**: By using quantization techniques like PQ, FAISS reduces memory usage, allowing it to handle larger datasets efficiently. 85 | 86 | By combining these techniques, FAISS achieves state-of-the-art performance in similarity search, making it suitable for applications such as semantic search, recommendation systems, and more. 87 | 88 | Citations: 89 | [1] https://github.com/facebookresearch/faiss/wiki/Faiss-indexes 90 | [2] https://unfoldai.com/effortless-large-scale-image-retrieval-with-faiss-a-hands-on-tutorial/ 91 | [3] https://www.youtube.com/watch?v=sKyvsdEv6rk 92 | [4] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/ 93 | [5] https://ai.meta.com/tools/faiss/ 94 | -------------------------------------------------------------------------------- /GenerativeAI/4. Transformers and Vector DB Interview prep.md: -------------------------------------------------------------------------------- 1 | Here are 30 key questions and answers to help you prepare for a Generative AI interview, with a focus on Transformer architectures and vector databases: 2 | 3 | ## Transformer Architectures 4 | 5 | 1. **What are the key components of a Transformer architecture?** 6 | - Encoder and decoder layers 7 | - Attention mechanisms 8 | - Feed-forward neural networks 9 | - Layer normalization and residual connections 10 | 11 | 2. **How does the attention mechanism work in Transformers?** 12 | - Computes a weighted sum of values based on the compatibility between keys and queries 13 | - Allows the model to focus on relevant parts of the input sequence 14 | - Enables capturing long-range dependencies without relying on recurrence or convolutions 15 | 16 | 3. **What are the advantages of using Transformer architectures compared to RNNs and CNNs?** 17 | - Parallelization of computations 18 | - Ability to capture long-range dependencies 19 | - Improved performance on tasks like machine translation and language understanding 20 | 21 | 4. 
**Can you explain the concept of self-attention in Transformers?** 22 | - Attention mechanism applied to the same sequence 23 | - Allows the model to attend to different positions within the same sequence 24 | - Helps capture contextual information within a sequence 25 | 26 | 5. **How do Transformer architectures handle variable-length input sequences?** 27 | - Use of padding tokens and masking techniques 28 | - Padding is added to ensure all sequences have the same length 29 | - Masking is applied to ignore the contributions of padding tokens during attention computations 30 | 31 | 6. **What are the differences between encoder-only, decoder-only, and encoder-decoder Transformer architectures?** 32 | - Encoder-only: Used for tasks like language understanding (e.g., BERT) 33 | - Decoder-only: Used for autoregressive tasks like language generation (e.g., GPT) 34 | - Encoder-decoder: Used for sequence-to-sequence tasks like machine translation (e.g., Transformer) 35 | 36 | 7. **Can you explain the concept of positional encoding in Transformer architectures?** 37 | - Injects positional information into the input embeddings 38 | - Enables the model to understand the relative or absolute positions of tokens in the sequence 39 | - Common techniques include sinusoidal positional encoding and learned positional embeddings 40 | 41 | 8. **How do Transformer architectures handle long-range dependencies compared to RNNs and CNNs?** 42 | - Attention mechanisms allow for direct connections between distant tokens 43 | - Reduces the path length between related tokens 44 | - Enables better modeling of long-range dependencies 45 | 46 | 9. **What are the challenges and limitations of Transformer architectures?** 47 | - Quadratic complexity of attention with respect to sequence length 48 | - Memory and computational requirements can be high for long sequences 49 | - Potential for overfitting due to lack of inductive biases present in RNNs and CNNs 50 | 51 | 10. **Can you discuss some recent advancements and variants of Transformer architectures?** 52 | - Sparse Transformer: Reduces computational complexity by using sparse attention patterns 53 | - Reformer: Uses locality-sensitive hashing to efficiently compute attention 54 | - Longform Transformer: Designed for long-form text generation tasks 55 | 56 | ## Vector Databases 57 | 58 | 11. **What are vector databases, and how do they differ from traditional databases?** 59 | - Store data in the form of high-dimensional vectors 60 | - Optimized for similarity search and nearest neighbor retrieval 61 | - Differ from traditional databases in terms of data structure and query types 62 | 63 | 12. **What are the main use cases of vector databases in Generative AI?** 64 | - Semantic search and retrieval of relevant information for generation tasks 65 | - Storage and indexing of embeddings generated by Generative AI models 66 | - Efficient retrieval of similar examples for few-shot learning and prompting 67 | 68 | 13. **Can you explain the concept of approximate nearest neighbor (ANN) search in vector databases?** 69 | - Aims to find the closest vectors to a given query vector 70 | - Employs techniques like locality-sensitive hashing (LSH) and graph-based methods 71 | - Provides a trade-off between search accuracy and computational efficiency 72 | 73 | 14. 
**How do vector databases handle high-dimensional data?** 74 | - Use specialized index structures like HNSW (Hierarchical Navigable Small World) graphs 75 | - Leverage dimensionality reduction techniques like PCA or t-SNE 76 | - Optimize for efficient storage and retrieval of high-dimensional vectors 77 | 78 | 15. **What are some popular vector database systems used in Generative AI?** 79 | - Pinecone: Offers a managed vector database service with support for ANN search 80 | - Milvus: An open-source vector database with a focus on scalability and performance 81 | - Weaviate: Combines vector search with a GraphQL API for easy integration 82 | 83 | 16. **Can you discuss the role of vector databases in few-shot learning and prompting for Generative AI?** 84 | - Store relevant examples or prompts as vectors 85 | - Retrieve similar examples based on the input prompt or context 86 | - Provide additional information or guidance to the Generative AI model 87 | 88 | 17. **How do vector databases enable efficient retrieval of relevant information for generation tasks?** 89 | - Store generated outputs or relevant information as vectors 90 | - Perform similarity search to find the most relevant vectors based on the input 91 | - Retrieve the corresponding information to guide or enhance the generation process 92 | 93 | 18. **What are some challenges and limitations of using vector databases in Generative AI?** 94 | - Handling dynamic updates and changes to the stored vectors 95 | - Ensuring data privacy and security when storing sensitive information 96 | - Balancing the trade-off between search accuracy and computational efficiency 97 | 98 | 19. **Can you discuss the integration of vector databases with Generative AI models?** 99 | - Seamless integration through APIs or query languages 100 | - Ability to perform vector search and retrieval within the Generative AI pipeline 101 | - Enables end-to-end solutions for tasks like question-answering and dialogue generation 102 | 103 | 20. **What are some future trends and advancements in vector databases for Generative AI?** 104 | - Improved scalability and performance for handling large-scale datasets 105 | - Incorporation of deep learning techniques for better similarity search 106 | - Integration with other AI technologies like knowledge graphs and reasoning engines 107 | 108 | ## Generative AI Fundamentals 109 | 110 | 21. **What are the key differences between discriminative and generative models in machine learning?** 111 | - Discriminative models learn the decision boundary between classes 112 | - Generative models learn the underlying data distribution to generate new samples 113 | 114 | 22. **Can you explain the concept of latent space in generative models?** 115 | - Represents a lower-dimensional space where the model encodes data features 116 | - Enables manipulation of these features to generate new, meaningful samples 117 | 118 | 23. **What are some common evaluation metrics used for assessing the quality of generated samples?** 119 | - Inception Score (IS): Measures the quality and diversity of generated samples 120 | - Fréchet Inception Distance (FID): Compares the statistics of generated samples with real samples 121 | - Human evaluation: Relies on subjective assessments by human judges 122 | 123 | 24. 
**How do you handle mode collapse in Generative Adversarial Networks (GANs)?** 124 | - Use techniques like mini-batch discrimination and spectral normalization 125 | - Incorporate different loss functions like WGAN-GP 126 | - Employ data augmentation strategies to increase the diversity of training samples 127 | 128 | 25. **Can you discuss the role of Generative AI in few-shot learning and prompt engineering?** 129 | - Generative models can generate relevant examples or prompts for few-shot learning 130 | - Prompts can guide the model to generate outputs that align with specific instructions or styles 131 | - Enables efficient learning from limited data and customization of generated outputs 132 | 133 | 26. **What are some ethical considerations when deploying Generative AI systems?** 134 | - Potential for generating biased or harmful content 135 | - Ensuring transparency and accountability in the decision-making process 136 | - Addressing issues related to data privacy and security 137 | 138 | 27. **How do you assess the quality and consistency of generated outputs from Generative AI models?** 139 | - Employ techniques like hallucination detection and factual consistency checking 140 | - Utilize tools like SelfCheckGPT and G-EVAL for evaluating the quality of generated text 141 | - Combine different evaluation methods to provide a comprehensive assessment 142 | 143 | 28. **Can you discuss the role of Generative AI in multimodal learning and generation?** 144 | - Integrate information from multiple modalities like text, images, and audio 145 | - Leverage cross-modal attention mechanisms to capture interactions between modalities 146 | - Enable generation of outputs in one modality conditioned on inputs from other modalities 147 | 148 | 29. **What are some common challenges and limitations in training Generative AI models?** 149 | - Unstable training dynamics and mode collapse in GANs 150 | - Difficulty in scaling up to high-resolution image generation or long-form text generation 151 | - Ensuring consistent and coherent generation across multiple steps or iterations 152 | 153 | 30. **Can you discuss the potential future advancements and applications of Generative AI?** 154 | - Continued improvements in generation quality and diversity 155 | - Expansion to new domains like video, 3D objects, and interactive environments 156 | - Integration with other AI technologies like reasoning engines and knowledge bases 157 | - Widespread adoption in various industries like entertainment, healthcare, and education 158 | 159 | Remember to tailor your answers based on your specific experience and knowledge, and feel free to explore additional resources to deepen your understanding of Transformer architectures and vector databases in the context of Generative AI. 160 | 161 | Citations: 162 | [1] https://www.usebraintrust.com/hire/interview-questions/generative-ai-specialists 163 | [2] https://www.reddit.com/r/MachineLearning/comments/17u7b19/d_genaillm_interview_prep/ 164 | [3] https://github.com/aishwaryanr/awesome-generative-ai-guide/blob/main/interview_prep/60_gen_ai_questions.md 165 | [4] https://incubity.ambilio.com/top-25-generative-ai-interview-questions-with-answers/ 166 | [5] https://www.youtube.com/watch?v=F1lsFTpsQLI 167 | -------------------------------------------------------------------------------- /GenerativeAI/5. 
FAISS interview prep.md: -------------------------------------------------------------------------------- 1 | Here are 30 interview questions and answers focused on FAISS (Facebook AI Similarity Search) and its applications in Generative AI, covering various scenarios you might encounter: 2 | 3 | ## FAISS and Its Applications 4 | 5 | 1. **What is FAISS and what are its primary uses?** 6 | - FAISS is a library developed by Facebook AI Research designed for efficient similarity search and clustering of dense vectors. It is primarily used for tasks like nearest neighbor search in high-dimensional spaces, which is essential in applications such as image retrieval, recommendation systems, and natural language processing. 7 | 8 | 2. **How does FAISS handle high-dimensional data?** 9 | - FAISS employs various indexing structures, such as inverted file systems and product quantization, to efficiently manage high-dimensional data. These structures allow for fast approximate nearest neighbor searches while reducing memory usage. 10 | 11 | 3. **What are the different types of indexes available in FAISS?** 12 | - FAISS provides several index types, including: 13 | - Flat Index: Exact nearest neighbor search. 14 | - IVFFlat: Inverted file index with flat quantization for approximate search. 15 | - HNSW: Hierarchical Navigable Small World graph for efficient approximate searches. 16 | - PQ (Product Quantization): Reduces the dimensionality of vectors for faster searches. 17 | 18 | 4. **Can you explain the concept of approximate nearest neighbor (ANN) search in FAISS?** 19 | - ANN search in FAISS aims to find the closest vectors to a query vector quickly without exhaustively comparing all vectors. It uses techniques like clustering and quantization to limit the search space, trading off some accuracy for speed. 20 | 21 | 5. **What are the advantages of using FAISS over other vector search libraries?** 22 | - FAISS is optimized for performance, scalability, and flexibility. It supports large datasets, provides various indexing methods, and is designed to work efficiently on both CPUs and GPUs, making it suitable for high-performance applications. 23 | 24 | 6. **How do you optimize FAISS for large-scale datasets?** 25 | - To optimize FAISS for large datasets, you can: 26 | - Use appropriate index types like IVFPQ or HNSW for faster searches. 27 | - Leverage GPU acceleration for computation-heavy tasks. 28 | - Fine-tune parameters like the number of clusters and quantization levels based on your data characteristics. 29 | 30 | 7. **What is the role of vector embeddings in FAISS?** 31 | - Vector embeddings represent data points in a high-dimensional space, capturing their semantic meanings. In FAISS, these embeddings are used to perform similarity searches, allowing the retrieval of similar items based on their vector representations. 32 | 33 | 8. **Can you describe a scenario where you used FAISS in a project?** 34 | - In a project for an e-commerce platform, I implemented FAISS to enhance the product recommendation system. By indexing product embeddings generated from user interactions, we achieved real-time recommendations based on user preferences, significantly improving user engagement. 35 | 36 | 9. **What challenges did you face while implementing FAISS, and how did you overcome them?** 37 | - One challenge was managing memory usage with large datasets. I addressed this by using product quantization to reduce the memory footprint of the embeddings while maintaining reasonable search accuracy. 
38 | 39 | 10. **How does FAISS compare to traditional databases for similarity search?** 40 | - Unlike traditional databases that focus on exact matches and structured queries, FAISS is optimized for high-dimensional vector similarity searches, allowing for approximate matches that are crucial in AI applications like image and text retrieval. 41 | 42 | 11. **What are the typical preprocessing steps before using FAISS?** 43 | - Typical preprocessing steps include: 44 | - Normalizing the vectors to ensure consistent distances. 45 | - Reducing dimensionality if necessary, using techniques like PCA. 46 | - Ensuring that the data is in the correct format for FAISS indexing. 47 | 48 | 12. **How do you evaluate the performance of a FAISS index?** 49 | - Performance can be evaluated using metrics such as: 50 | - Recall: The fraction of relevant items retrieved. 51 | - Precision: The fraction of retrieved items that are relevant. 52 | - Latency: The time taken to perform searches. 53 | 54 | 13. **What is the significance of the `nlist` parameter in FAISS?** 55 | - The `nlist` parameter defines the number of clusters in an inverted file index. A higher `nlist` can improve recall but may increase search time and memory usage. Tuning this parameter is crucial for balancing performance and resource usage. 56 | 57 | 14. **How can FAISS be integrated with machine learning models?** 58 | - FAISS can be integrated with machine learning models by using it to index embeddings generated by those models. For example, after training a neural network to generate embeddings for images, FAISS can be used to perform similarity searches among those embeddings. 59 | 60 | 15. **What is the role of quantization in FAISS?** 61 | - Quantization reduces the precision of vector representations to decrease memory usage and speed up searches. FAISS supports various quantization techniques, such as scalar quantization and product quantization, to optimize performance. 62 | 63 | 16. **Can you explain the concept of "inverted file" indexing in FAISS?** 64 | - Inverted file indexing groups vectors into clusters and maintains a list of vectors for each cluster. This allows FAISS to quickly narrow down the search to a subset of vectors, significantly speeding up the nearest neighbor search process. 65 | 66 | 17. **How do you handle updates to the dataset in FAISS?** 67 | - FAISS allows for dynamic updates by adding or removing vectors from the index. However, for large-scale updates, it may be more efficient to rebuild the index periodically rather than updating it incrementally. 68 | 69 | 18. **What are some common pitfalls when using FAISS?** 70 | - Common pitfalls include: 71 | - Not normalizing vectors, which can lead to inaccurate distance calculations. 72 | - Using inappropriate index types for the data size and search requirements. 73 | - Failing to tune parameters like `nlist` and `nprobe` for optimal performance. 74 | 75 | 19. **How does FAISS support GPU acceleration?** 76 | - FAISS provides a GPU module that allows for the indexing and searching of vectors on NVIDIA GPUs. This significantly speeds up operations, especially for large datasets and complex queries. 77 | 78 | 20. **What is the `nprobe` parameter in FAISS, and how does it affect search results?** 79 | - The `nprobe` parameter determines the number of clusters to search during a query. A higher `nprobe` increases the chances of finding relevant results but also increases search time. Tuning this parameter is essential for balancing speed and accuracy. 
80 | 81 | 21. **How can you use FAISS for clustering tasks?** 82 | - FAISS can be used for clustering by applying algorithms like k-means on the vector embeddings. Once clusters are formed, FAISS can efficiently retrieve points belonging to specific clusters or find nearest neighbors within those clusters. 83 | 84 | 22. **What are the trade-offs between using exact and approximate search in FAISS?** 85 | - Exact search guarantees the most accurate results but is computationally expensive and slow for large datasets. Approximate search is faster and uses less memory but may sacrifice some accuracy, making it suitable for real-time applications. 86 | 87 | 23. **Can FAISS be used for text similarity search? If so, how?** 88 | - Yes, FAISS can be used for text similarity search by converting text into embeddings using models like BERT or Sentence Transformers. These embeddings can then be indexed in FAISS for efficient similarity searches. 89 | 90 | 24. **How would you implement a recommendation system using FAISS?** 91 | - To implement a recommendation system: 92 | - Generate embeddings for items and users. 93 | - Index these embeddings using FAISS. 94 | - For a given user, retrieve similar items based on their embedding using FAISS's nearest neighbor search. 95 | 96 | 25. **What is the role of the `metric` parameter in FAISS?** 97 | - The `metric` parameter defines the distance metric used for similarity calculations, such as L2 (Euclidean) or inner product. The choice of metric can significantly affect the search results and should align with the data characteristics. 98 | 99 | 26. **How do you ensure the scalability of FAISS in production environments?** 100 | - Scalability can be ensured by: 101 | - Using distributed computing frameworks to handle large datasets. 102 | - Optimizing index parameters based on the expected load and query patterns. 103 | - Regularly monitoring performance and adjusting configurations as needed. 104 | 105 | 27. **What are some best practices for using FAISS effectively?** 106 | - Best practices include: 107 | - Regularly profiling and benchmarking index performance. 108 | - Experimenting with different index types and parameters. 109 | - Keeping the vector space normalized and well-structured for optimal searches. 110 | 111 | 28. **How do you manage the trade-off between accuracy and speed in FAISS?** 112 | - Manage this trade-off by tuning parameters like `nlist`, `nprobe`, and choosing the right index type based on the specific application requirements. Regular testing and validation against real-world queries can help find the right balance. 113 | 114 | 29. **Can you discuss a specific feature of FAISS that you find particularly useful?** 115 | - One particularly useful feature is the ability to perform multi-threaded searches, which significantly speeds up retrieval times, especially when handling large datasets in real-time applications. 116 | 117 | 30. **What future developments do you foresee for FAISS and vector search technologies?** 118 | - Future developments may include enhanced support for hybrid search combining structured and unstructured data, improved algorithms for dynamic indexing, and better integration with deep learning frameworks for real-time applications. 119 | 120 | These questions and answers should help you prepare effectively for your interview related to FAISS and its applications in Generative AI. 
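To make several of the answers above concrete (the index types from question 3, `nlist` from question 13, `nprobe` from question 20, and the metric from question 25), here is a minimal, self-contained sketch of building and querying an IVF index; the data is randomly generated and purely illustrative:

```python
import numpy as np
import faiss

d = 64       # vector dimension
nb = 10000   # number of database vectors
np.random.seed(0)
xb = np.random.random((nb, d)).astype('float32')  # database vectors
xq = np.random.random((5, d)).astype('float32')   # query vectors

# IVF index: a coarse quantizer partitions the space into nlist clusters
nlist = 100
quantizer = faiss.IndexFlatL2(d)                # L2 (Euclidean) metric
index = faiss.IndexIVFFlat(quantizer, d, nlist)

index.train(xb)   # learn the cluster centroids from the data
index.add(xb)     # add the database vectors to the index

index.nprobe = 8  # clusters visited per query: the speed/accuracy dial
D, I = index.search(xq, 4)  # 4 nearest neighbours per query
print(I)  # indices of the nearest database vectors
print(D)  # squared L2 distances to them
```

Raising `nprobe` toward `nlist` makes the search closer to exhaustive (higher recall, slower queries), which is exactly the accuracy/speed trade-off described in questions 22 and 28.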
121 | 122 | Citations: 123 | [1] https://incubity.ambilio.com/top-25-generative-ai-interview-questions-with-answers/ 124 | [2] https://www.youtube.com/watch?v=F1lsFTpsQLI 125 | [3] https://www.usebraintrust.com/hire/interview-questions/generative-ai-specialists 126 | [4] https://www.reddit.com/r/MachineLearning/comments/17u7b19/d_genaillm_interview_prep/ 127 | [5] https://blog.streamlit.io/ai-interviewer-customized-interview-preparation-with-generative-ai/ 128 | -------------------------------------------------------------------------------- /GenerativeAI/6.RecursiveReferenceRAG.md: -------------------------------------------------------------------------------- 1 | Here's an example implementation using Python and Langchain to handle document references in a RAG architecture: 2 | 3 | ```python 4 | from langchain.document_loaders import TextLoader 5 | from langchain.embeddings import HuggingFaceEmbeddings 6 | from langchain.vectorstores import Chroma 7 | from langchain.chains import RetrievalQA 8 | from langchain.llms import HuggingFaceHub 9 | 10 | class DocumentReferenceRAG: 11 | def __init__(self, documents): 12 | self.documents = documents 13 | self.embeddings = HuggingFaceEmbeddings() 14 | self.vectorstore = Chroma.from_documents(self.documents, self.embeddings) 15 | self.llm = HuggingFaceHub(repo_id="google/flan-t5-xl") 16 | self.qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff", retriever=self.vectorstore.as_retriever(), return_source_documents=True) # expose retrieved docs to the caller 17 | 18 | def answer_question(self, question, max_recursion_depth=3): 19 | return self._recursive_answer(question, max_recursion_depth) 20 | 21 | def _recursive_answer(self, question, max_recursion_depth, processed_docs=None): 22 | if processed_docs is None: 23 | processed_docs = set() 24 | 25 | result = self.qa({"query": question}) # returns {"result": ..., "source_documents": [...]} 26 | processed_docs.add(result["source_documents"][0].metadata['source']) 27 | 28 | for doc in result["source_documents"]: 29 | if 'referenced_docs' in doc.metadata: 30 | for ref_doc_link in doc.metadata['referenced_docs']: 31 | if ref_doc_link not in processed_docs and max_recursion_depth > 0: 32 | ref_doc = self._retrieve_document(ref_doc_link) 33 | if ref_doc: 34 | self.documents.append(ref_doc) 35 | self.vectorstore = Chroma.from_documents(self.documents, self.embeddings) 36 | self.qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff", retriever=self.vectorstore.as_retriever(), return_source_documents=True) 37 | result = self._recursive_answer(question, max_recursion_depth - 1, processed_docs) 38 | break 39 | 40 | return result 41 | 42 | def _retrieve_document(self, doc_link): 43 | # Implement document retrieval logic based on the provided link 44 | # For example, load the document from a file or database 45 | loader = TextLoader(doc_link) 46 | return loader.load()[0] 47 | 48 | # Example usage 49 | doc1 = TextLoader('doc1.txt').load()[0] 50 | doc2 = TextLoader('doc2.txt').load()[0] 51 | doc3 = TextLoader('doc3.txt').load()[0] 52 | doc4 = TextLoader('doc4.txt').load()[0] 53 | doc5 = TextLoader('doc5.txt').load()[0] 54 | 55 | rag = DocumentReferenceRAG([doc1, doc2, doc3, doc4, doc5]) 56 | question = "What is the relationship between document 1 and document 3?" 57 | answer = rag.answer_question(question) 58 | print(answer["result"]) 59 | ``` 60 | 61 | In this example: 62 | 63 | 1. The `DocumentReferenceRAG` class is defined to handle the recursive retrieval and processing of documents. The chain is built with `return_source_documents=True`, so each call returns both the answer and the documents it was grounded in. 64 | 65 | 2.
The `__init__` method initializes the necessary components: 66 | - Loads the initial set of documents 67 | - Creates document embeddings using HuggingFaceEmbeddings 68 | - Stores the documents in a Chroma vector store 69 | - Sets up the LLM (HuggingFaceHub) and RetrievalQA chain 70 | 71 | 3. The `answer_question` method takes a question and an optional maximum recursion depth. It calls the `_recursive_answer` method to generate the answer. 72 | 73 | 4. The `_recursive_answer` method implements the recursive retrieval process: 74 | - Generates an initial answer using the RetrievalQA chain 75 | - Checks if the generated answer references any other documents 76 | - If referenced documents are found, recursively retrieves them using the `_retrieve_document` method 77 | - Appends the retrieved documents to the document collection and updates the vector store and RetrievalQA chain 78 | - Repeats the process until no new referenced documents are found or the maximum recursion depth is reached 79 | 80 | 5. The `_retrieve_document` method is a placeholder for the actual document retrieval logic. In this example, it loads the document from a file using the `TextLoader` from Langchain. 81 | 82 | 6. In the example usage, five documents are loaded, and the `DocumentReferenceRAG` class is instantiated with these documents. 83 | 84 | 7. A question is asked, and the `answer_question` method is called to generate the final answer, considering the referenced documents. 85 | 86 | This implementation demonstrates how to extend a RAG architecture to handle document references using Langchain. The recursive retrieval process ensures that all relevant documents are considered when answering questions, even if they are referenced within other documents. 87 | 88 | Remember to customize the `_retrieve_document` method to match your specific document storage and retrieval mechanism. Additionally, you may want to add more error handling and optimization techniques based on your requirements. 89 | -------------------------------------------------------------------------------- /GenerativeAI/References.md: -------------------------------------------------------------------------------- 1 | FAISS Similarity Search 2 | https://www.youtube.com/playlist?list=PLIUOU7oqGTLhlWpTz4NnuT3FekouIVlqc 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Amogh Singhal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /adjacentElementProduct.py: -------------------------------------------------------------------------------- 1 | # Given an array of integers, find the pair of adjacent elements 2 | # that has the largest product and return that product. 3 | 4 | # Approach 1: (Brute Force) - Check all the pairs in the list and return the maximum product (note: this compares every pair, not just adjacent ones, so it solves the "any two elements" variant) 5 | # Time Complexity: O(N^2) 6 | 7 | def adjacentElementProductBF(inputArray): 8 | largestProduct = -999999 9 | 10 | # for sanity check, assert if array contains at least 2 elements 11 | if len(inputArray) < 2: 12 | print("No pairs exist") 13 | return -1 14 | 15 | for i in range(0, len(inputArray)): 16 | for j in range(i+1, len(inputArray)): 17 | currentProduct = inputArray[i]*inputArray[j] 18 | 19 | if currentProduct > largestProduct: 20 | largestProduct = currentProduct 21 | 22 | return largestProduct 23 | 24 | # Approach 2: (Sort & Pick Last Pair) - Sort the list and then pick the last two numbers 25 | # Caveat: All elements must be positive, and sorting discards adjacency, so this also solves the "any two elements" variant 26 | # Time Complexity: O(Nlog(N)) 27 | 28 | def adjacentElementsProductSort(inputArray): 29 | size = len(inputArray) 30 | 31 | if size < 2: 32 | print("No pairs exist") 33 | return -1 34 | 35 | sortedArray = sorted(inputArray) 36 | return sortedArray[-1] * sortedArray[-2] 37 | 38 | 39 | def adjacentElementsProduct(inputArray): 40 | 41 | length = int(len(inputArray)) 42 | 43 | maxm = inputArray[0]*inputArray[1] 44 | product = 1 45 | for i in range(1, length-1): 46 | product = inputArray[i]*inputArray[i+1] 47 | 48 | if product>maxm: 49 | maxm = product 50 | 51 | return maxm 52 | 53 | 54 | # print(adjacentElementsProduct([3,6,7,5])) 55 | 56 | print(adjacentElementsProduct([3, 6, -2, -5, 7, 3])) 57 | 58 | #Alternate solution 59 | #return max([inputArray[i]*inputArray[i+1] for i in range(0, int(len(inputArray)-1))]) 60 | -------------------------------------------------------------------------------- /atoi.py: -------------------------------------------------------------------------------- 1 | # Convert the string "123" into 123, without using the built-in `int()` 2 | 3 | # Strategy 4 | # 1. loop through each digit 5 | # 2. find the digit in range object of range(10) 6 | # 3. once the number is found, add it in placeholder 7 | # 4. Multiply each iteration by 10 (start with 0) 8 | 9 | def atoi(inputStr): 10 | outputNum = 0 11 | for char in inputStr: 12 | for i in range(10): 13 | if str(i) == char: 14 | outputNum = outputNum * 10 + i 15 | return outputNum 16 | 17 | x = "123" 18 | y = atoi(x) 19 | print(y) 20 | -------------------------------------------------------------------------------- /binary_search_recursive.py: -------------------------------------------------------------------------------- 1 | # Given an array, find if the number exists 2 | # This is a `recursive` implementation of the 3 | # binary search.
If the element is not found 4 | # it returns -1 5 | 6 | 7 | def binarySearch(lst, key, l, r): 8 | if r >= l: 9 | mid = l + (r - l) // 2 10 | # use print(l, mid, r) to view the process 11 | if lst[mid] == key: 12 | return mid 13 | elif lst[mid] < key: 14 | return binarySearch(lst, key, mid + 1, r) 15 | elif lst[mid] > key: 16 | return binarySearch(lst, key, l, mid - 1) 17 | else: 18 | return -1 19 | 20 | 21 | arr = [int(i) for i in range(101)] 22 | print(binarySearch(arr, 67, 0, len(arr) - 1)) 23 | -------------------------------------------------------------------------------- /bits_wilp/Ex2_Numpy_Q1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "846061b9-4829-48a0-a2be-dae592b8f95a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "0901b4de-dd28-4433-ac7b-8db0d9a0f995", 17 | "metadata": {}, 18 | "source": [ 19 | "### Size of Numpy array in bytes" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "2cec0596-d393-479b-bf67-3deddec0ea9e", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdin", 30 | "output_type": "stream", 31 | "text": [ 32 | "Enter integers separated by space. Press ENTER to end... 34 23 67 89\n" 33 | ] 34 | }, 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Given sequence is\n", 40 | "[34, 23, 67, 89]\n", 41 | "Number of elements in the numpy array: 4\n", 42 | "Total bytes consumed by the numpy array: 16\n", 43 | "Size in bytes of each element in the numpy array: 4\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "try:\n", 49 | " # feed integer array from user\n", 50 | " arr = list(map(int, input(\"Enter integers separated by space.
Press ENTER to end...\").split()))\n", 51 | " print(\"Given sequence is\")\n", 52 | " print(arr)\n", 53 | " # convert Python array to Numpy array\n", 54 | " np_array = np.array(arr, dtype=int)\n", 55 | " print(f\"Number of elements in the numpy array: {np_array.size}\")\n", 56 | " print(f\"Total bytes consumed by the numpy array: {np_array.nbytes}\")\n", 57 | " print(f\"Size in bytes of each element in the numpy array: {(np_array.nbytes)//(np_array.size)}\")\n", 58 | "except ValueError as e:\n", 59 | " print(\"ERROR: Please enter only integers !!!\")\n", 60 | " print(e)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "3e18b28b-796a-428f-96f7-cd530fd0cfd1", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [] 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.8.5" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 5 93 | } 94 | -------------------------------------------------------------------------------- /bits_wilp/Ex2_Numpy_Q2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e572d5e4-94a8-4059-bc1f-332e713b02e3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "b525ea5a-5f30-40ac-a98c-40394d788bba", 17 | "metadata": {}, 18 | "source": [ 19 | "### Set Difference Between Two Numpy Arrays" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 7, 25 | "id": "bdb7936b-b56f-458e-9e72-70eeb7968190", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# Function to take input array from user\n", 30 | "def np_array_frm_input(prompt):\n", 31 | " try:\n", 32 | " arr = list(map(int, input(prompt).split()))\n", 33 | " np_array = np.array(arr, dtype=int)\n", 34 | " except ValueError as e:\n", 35 | " np_array = None\n", 36 | " print(\"ERROR: Please enter only integers !!!\")\n", 37 | " print(e)\n", 38 | " return np_array" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 17, 44 | "id": "1654ed7a-63f9-44bf-a30e-ea85ecd806fd", 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdin", 49 | "output_type": "stream", 50 | "text": [ 51 | "\n", 52 | "Enter numbers for first sequence: 6 4 2\n", 53 | "\n", 54 | "Enter numbers for second sequence: 3 4\n" 55 | ] 56 | }, 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "\n", 62 | "The set difference between [6 4 2] and [3 4] is [2 6]\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Return the sorted, unique values in array1 that are not in array2\n", 68 | "np_arr1 = np_array_frm_input(\"\\nEnter numbers for first sequence: \")\n", 69 | "np_arr2 = np_array_frm_input(\"\\nEnter numbers for second sequence: \")\n", 70 | "print(f\"\\nThe set difference between {np_arr1} and {np_arr2} is {np.setdiff1d(np_arr1, np_arr2)}\")" 71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": {
"codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.8.5" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 5 95 | } 96 | -------------------------------------------------------------------------------- /bits_wilp/Ex2_Numpy_Q3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "3b75f73e-6d1b-4a9e-935d-4a3d229bcc6a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "65b9b866-0960-40d1-8deb-4ebe1261d420", 17 | "metadata": {}, 18 | "source": [ 19 | "### Cross Product of two given vectors" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "1a98a54e-7a31-4575-84ec-c72c4aadb8a4", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# Function to take input array from user\n", 30 | "def np_array_frm_input(prompt):\n", 31 | " try:\n", 32 | " arr = list(map(int, input(prompt).split()))\n", 33 | " np_array = np.array(arr, dtype=int)\n", 34 | " except ValueError as e:\n", 35 | " np_array = None\n", 36 | " print(\"ERROR: Please enter only integers !!!\")\n", 37 | " print(e)\n", 38 | " return np_array" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "id": "f540a81d-f8e5-44c1-a280-ac96211ea7c4", 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdin", 49 | "output_type": "stream", 50 | "text": [ 51 | "Enter the first integer vector... 3\n", 52 | "Enter the second integer vector... 
4 5\n" 53 | ] 54 | }, 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "ERROR: Vectors must be 2D pr 3D for computing cross-product !!!\n", 60 | "incompatible dimensions for cross product\n", 61 | "(dimension must be 2 or 3)\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "try:\n", 67 | " vec1 = np_array_frm_input(\"Enter the first integer vector...\")\n", 68 | " vec2 = np_array_frm_input(\"Enter the second integer vector...\")\n", 69 | " print(f\"The cross product between {vec1} and {vec2} is {np.cross(vec1, vec2)}\")\n", 70 | "except ValueError as e:\n", 71 | " print(\"ERROR: Vectors must be 2D pr 3D for computing cross-product !!!\")\n", 72 | " print(e)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "a7c915f7-e509-46e9-843a-c891fd2e64a3", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.8.5" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 5 105 | } 106 | -------------------------------------------------------------------------------- /bits_wilp/Ex2_Numpy_Q4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8ddc2bc7-042f-4552-92a3-f9047a132e56", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "30c3a8f2-797e-4427-9acd-0ea951421e92", 17 | "metadata": {}, 18 | "source": [ 19 | "### Determinant of a square array" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 22, 25 | "id": "16a039d2-473c-4b23-8d08-108b0edae19b", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdin", 30 | "output_type": "stream", 31 | "text": [ 32 | "Enter size of square array: 2\n", 33 | "Enter a square array in row-wise manner: 3 4 2 2\n" 34 | ] 35 | }, 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "\n", 41 | "SUCCESS: Reshape operation completed\n", 42 | "Given square array is\n", 43 | "[[3 4]\n", 44 | " [2 2]]\n", 45 | "\n", 46 | "The approx. determinant of the above square array is -2.0\n", 47 | "The absolute determinant of the above square array is -1.9999999999999998\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "arr_shape = int(input(\"Enter size of square array: \"))\n", 53 | "arr = list(map(int,input(\"Enter a square array in row-wise manner: \").split()))\n", 54 | "\n", 55 | "if len(arr) == arr_shape**2 :\n", 56 | " sq_arr = np.array(arr, dtype=int).reshape(arr_shape, arr_shape)\n", 57 | " print(\"\\nSUCCESS: Reshape operation completed\")\n", 58 | " print(f\"Given square array is\")\n", 59 | " print(sq_arr)\n", 60 | " print(f\"\\nThe approx. 
determinant of the above square array is {round(np.linalg.det(sq_arr),0)}\")\n", 61 | " print(f\"The exact determinant of the above square array is {np.linalg.det(sq_arr)}\")\n", 62 | "else:\n", 63 | " print(f\"ERROR: Cannot reshape array of size {len(arr)} into {(arr_shape, arr_shape)}\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "35643ed3-5bfe-443a-adf0-df266ba9da72", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.8.5" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 5 96 | } 97 | -------------------------------------------------------------------------------- /bits_wilp/Ex2_Numpy_Q5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "678ce984-841d-49bd-b2c5-8586aaed05b2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "c8866271-04d5-4bd6-9916-f55bc5889968", 17 | "metadata": {}, 18 | "source": [ 19 | "### Eigenvalues and Eigenvectors of a square array" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "id": "346bcf67-e5be-49d2-8e22-31d87ed416cd", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdin", 30 | "output_type": "stream", 31 | "text": [ 32 | "Enter size of square array: 2\n", 33 | "Enter a square array in row-wise manner: 1 2 2 1\n" 34 | ] 35 | }, 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "\n", 41 | "SUCCESS: Reshape operation completed\n", 42 | "Given square array is\n", 43 | "[[1 2]\n", 44 | " [2 1]]\n", 45 | "The eigenvalues of the above square array are\n", 46 | "[ 3. 
-1.]\n", 47 | "The eigenvectors of the above square array is\n", 48 | "[[ 0.70710678 -0.70710678]\n", 49 | " [ 0.70710678 0.70710678]]\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "arr_shape = int(input(\"Enter size of square array: \"))\n", 55 | "arr = list(map(int,input(\"Enter a square array in row-wise manner: \").split()))\n", 56 | "\n", 57 | "if len(arr) == arr_shape**2 :\n", 58 | " sq_arr = np.array(arr, dtype=int).reshape(arr_shape, arr_shape)\n", 59 | " print(\"\\nSUCCESS: Reshape operation completed\")\n", 60 | " print(f\"Given square array is\")\n", 61 | " print(sq_arr)\n", 62 | " \n", 63 | " eig_val, eig_vec = np.linalg.eig(sq_arr)\n", 64 | " print(\"The eigenvalues of the above square array is\")\n", 65 | " print(eig_val)\n", 66 | " print(\"The eigenvectors of the above square array is\")\n", 67 | " print(eig_vec)\n", 68 | "else:\n", 69 | " print(f\"ERROR: Cannot reshape array of size {len(arr)} into {(arr_shape, arr_shape)}\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "897a5000-e52d-4a07-afc1-2b722ec423e5", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.8.5" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 5 102 | } 103 | -------------------------------------------------------------------------------- /bits_wilp/Ex2_Numpy_Q6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "6fbe566f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "a0d6b1f1", 17 | "metadata": {}, 18 | "source": [ 19 | "### Matrix Multiplication" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 14, 25 | "id": "77a499e1", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "First matrix:\n", 33 | " [[4 3 3]\n", 34 | " [2 4 2]\n", 35 | " [0 1 4]]\n", 36 | "\n", 37 | "Second matrix:\n", 38 | " [[4 8 1]\n", 39 | " [0 2 3]\n", 40 | " [6 6 2]]\n", 41 | "\n", 42 | "Product of the two matrix\n", 43 | "[[34 56 19]\n", 44 | " [20 36 18]\n", 45 | " [24 26 11]]\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "def matgen3d():\n", 51 | " \"\"\"\n", 52 | " Function to generate a random integer 3x3 matrix\n", 53 | " Min value: 1, Max value: 9\n", 54 | " \"\"\"\n", 55 | " return np.random.randint(low=0, high=10, size=(3,3))\n", 56 | "\n", 57 | "mat1 = matgen3d()\n", 58 | "mat2 = matgen3d()\n", 59 | "\n", 60 | "print(\"First matrix:\\n\", mat1)\n", 61 | "print()\n", 62 | "print(\"Second matrix:\\n\", mat2)\n", 63 | "print()\n", 64 | "print(\"Product of the two matrix\")\n", 65 | "print(np.matmul(mat1, mat2))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "2b277c10", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 3", 80 | "language": "python", 81 | "name": "python3" 82 
| }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 3 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython3", 93 | "version": "3.8.5" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 5 98 | } 99 | -------------------------------------------------------------------------------- /bits_wilp/Quiz 1_ S2-20_DSECLPFDS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Python-Interview-Problems-for-Practice/5e2189c50841bb9ed54807986b8dd010590fd9b5/bits_wilp/Quiz 1_ S2-20_DSECLPFDS.pdf -------------------------------------------------------------------------------- /bits_wilp/binomialCoefficient.py: -------------------------------------------------------------------------------- 1 | def factorial(n): 2 | if n < 2: 3 | return 1 4 | else: 5 | return n * factorial(n-1) 6 | 7 | 8 | n = int(input("Enter the value of n: ")) 9 | r = int(input("Enter the value of r: ")) 10 | 11 | # int division to avoid `float` output 12 | ncr = factorial(n) // (factorial(r) * factorial(n-r)) 13 | 14 | # string formatting 15 | result = "The binomial coefficient for {n} and {r} is {ncr}".format(n=n, r=r, ncr=ncr) 16 | print(result) 17 | -------------------------------------------------------------------------------- /bits_wilp/calculateFrequency.py: -------------------------------------------------------------------------------- 1 | def calculateFrequency(str): 2 | nums = str.split() 3 | freq = {} 4 | 5 | for i in nums: 6 | if i not in freq.keys(): 7 | freq[i] = 1 8 | else: 9 | freq[i] += 1 10 | 11 | print(freq) 12 | 13 | 14 | print("Please enter the numbers separated by space. 
\n Press ENTER to exit: ") 15 | x = input() 16 | calculateFrequency(x) 17 | -------------------------------------------------------------------------------- /bits_wilp/isAngstrom.py: -------------------------------------------------------------------------------- 1 | def isAngstrom(n): 2 | result = False 3 | sum = 0 4 | order = len(n) 5 | 6 | for i in n: 7 | sum = sum + int(i)**order 8 | 9 | if sum == int(n): 10 | result = True 11 | return result 12 | 13 | print("Please enter a number: ") 14 | num = input() 15 | flag = isAngstrom(num) 16 | 17 | if flag: 18 | print(num, "is an Angstrom number") 19 | else: 20 | print(num, "is NOT an Angstrom number") 21 | -------------------------------------------------------------------------------- /bits_wilp/isPalindrome.py: -------------------------------------------------------------------------------- 1 | def isPalindrome(str): 2 | result = False 3 | 4 | if str == str[::-1]: 5 | result = True 6 | 7 | return result 8 | 9 | print("Please enter a string: ") 10 | x = input() 11 | flag = isPalindrome(x) 12 | 13 | if flag: 14 | print(x, "is a Palindrome") 15 | else: 16 | print(x, "is NOT a Palindrome") 17 | -------------------------------------------------------------------------------- /bits_wilp/practice.py: -------------------------------------------------------------------------------- 1 | def most_common(str_a, str_b): 2 | return set(str_a) & set(str_b) 3 | 4 | result = most_common("NAINA", "RENNE") 5 | print(result) # {N} 6 | 7 | def get_freq(str): 8 | freq_dict = {} 9 | for char in str.split(): 10 | if char not in freq_dict.keys(): 11 | freq_dict[char] = 1 12 | else: 13 | freq_dict[char] += 1 14 | 15 | return freq_dict 16 | 17 | result = get_freq("Amogh loves to eat apple and mango. His sister also loves eating apple and mango") 18 | print(result) 19 | # {'Amogh': 1, 'loves': 2, 'to': 1, 'eat': 1, 'apple': 2, 'and': 2, 'mango.': 1, 'His': 1, 'sister': 1, 'also': 1, 'eating': 1, 'mango': 1} 20 | 21 | 22 | 23 | def is_prime(num): 24 | flag = 1 25 | 26 | # include num//2 itself so perfect squares like 4 are handled correctly 27 | for i in range(2, num//2 + 1): 28 | if num % i == 0: 29 | flag = 0 30 | break 31 | 32 | if flag == 0: 33 | print(f"{num} is not prime...") 34 | else: 35 | print(f"{num} is prime...") 36 | 37 | is_prime(7919) # 7919 is prime 38 | 39 | 40 | def fibo_iter(n_terms): 41 | first, second = 0, 1 42 | for i in range(0, n_terms): 43 | if i <= 1: 44 | result = i 45 | else: 46 | result = first + second 47 | first = second 48 | second = result 49 | print(result, end=' ') 50 | 51 | fibo_iter(5) # 0 1 1 2 3 52 | 53 | def fibo_recur(n): 54 | if n == 0: 55 | return 0 56 | elif n == 1: 57 | return 1 58 | else: 59 | return fibo_recur(n-1) + fibo_recur(n-2) 60 | 61 | 62 | for i in range(0, 10): 63 | print(fibo_recur(i), end=' ') # 0 1 1 2 3 5 8 13 21 34 64 | -------------------------------------------------------------------------------- /bits_wilp/primeFactorization.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | 3 | def get_prime_factors(num): 4 | factors = [] 5 | 6 | # get all the 2's 7 | while num % 2 == 0: 8 | factors.append(2) 9 | num = num / 2 10 | 11 | # check for other prime factors 12 | # sqrt is used to reduce the search range to sqrt(n) 13 | # step size of 2 to avoid checking with even numbers 14 | for i in range(3, int(sqrt(num))+1, 2): 15 | while num % i == 0: 16 | # print(num, i) 17 | factors.append(i) 18 | num = num / i 19 | 20 | # num is now the last prime number 21 | if num > 2: 22 | factors.append(int(num)) 23 | 24 | return factors 25 | 26 | 27 | n = 
int(input("Enter the number: ")) 28 | result = get_prime_factors(n) 29 | 30 | print("The factors of {n} are {result}".format(n=n, result=result)) 31 | 32 | # Enter the number: 1081310109 33 | # The factors of 1081310109 are [3, 11, 17, 23, 181, 463] 34 | -------------------------------------------------------------------------------- /bits_wilp/sample.txt: -------------------------------------------------------------------------------- 1 | What is Lorem Ipsum? 2 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. 3 | 4 | Why do we use it? 5 | It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like). 6 | 7 | 8 | Where does it come from? 9 | Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32. 10 | 11 | The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham. 12 | -------------------------------------------------------------------------------- /bits_wilp/searching.py: -------------------------------------------------------------------------------- 1 | 2 | # Searching: Given a sorted array arr[] of n elements, write a 3 | # function to search a given element x in arr[]. Do it using linear 4 | # and binary search techniques. 
5 | 6 | def linear_search(arr, elem): 7 | for i in range(0,len(arr)): 8 | if arr[i] == elem: 9 | print("{elem} found at {index} !".format(elem=elem, index=i)) 10 | return 11 | print("{elem} not found in given sequence".format(elem=elem)) 12 | 13 | def binary_search(arr, elem, l, r): 14 | try: 15 | mid = (l + r) // 2 16 | 17 | if elem == arr[mid]: 18 | print("{elem} found at {index} !".format(elem=elem, index=mid)) 19 | return 20 | elif elem > arr[mid]: 21 | binary_search(arr, elem, l=mid+1, r=r) 22 | elif elem < arr[mid]: 23 | binary_search(arr, elem, l=l, r=mid-1) 24 | 25 | except RecursionError as e: 26 | print("{elem} not found in given sequence".format(elem=elem)) 27 | print(e) 28 | 29 | 30 | arr = list(map(int, input("Enter numbers separated by space. Press ENTER to exit: ").split())) 31 | arr.sort() 32 | print("Given sequence is:") 33 | print(arr) 34 | 35 | elem = int(input("Enter element to be searched: ")) 36 | choice = int(input("Choose search method: \n 1. Linear Search \n 2. Binary Search \n")) 37 | 38 | if choice == 1: 39 | linear_search(arr, elem) 40 | elif choice == 2: 41 | binary_search(arr, elem, l=0, r=len(arr)-1) 42 | else: 43 | print("Error: Please enter a valid choice !") 44 | -------------------------------------------------------------------------------- /bits_wilp/sumOfDigits.py: -------------------------------------------------------------------------------- 1 | def sumOfDigits(n): 2 | sum = 0 3 | while n > 0: 4 | rem = n % 10 5 | sum = sum + rem 6 | n = n // 10 7 | return sum 8 | 9 | print("Please enter a number: ") 10 | num = int(input()) 11 | sod = sumOfDigits(num) 12 | print("The sum of digits for", num, "is", sod) 13 | -------------------------------------------------------------------------------- /bits_wilp/topThreeFrequent.py: -------------------------------------------------------------------------------- 1 | filepath = './sample.txt' 2 | freq_counter = {} 3 | 4 | # using context manager 5 | with open(filepath, mode='r') as handle: 6 | content = handle.read() 7 | words = content.split() 8 | # file is closed now 9 | 10 | for w in words: 11 | if w not in freq_counter.keys(): 12 | freq_counter[w] = 1 13 | else: 14 | freq_counter[w] += 1 15 | 16 | # sorting the sequence by values in `reverse` order 17 | sorted_by_freq = sorted(freq_counter, key=freq_counter.get, reverse=True) 18 | 19 | top_three = sorted_by_freq[:3] 20 | for k in top_three: 21 | print(k, ":", freq_counter[k]) 22 | -------------------------------------------------------------------------------- /bresenham_line_algorithm.py: -------------------------------------------------------------------------------- 1 | # Bresenham Line Algorithm (BLA) is one of the earliest algorithms developed 2 | # in computer graphics. It is used for drawing lines. It is an efficient method because 3 | # it involves only integer addition, subtractions, and multiplication operations. 4 | 5 | # These operations can be performed very rapidly so lines can be generated quickly. 6 | 7 | # Reference: http://floppsie.comp.glam.ac.uk/Southwales/gaius/gametools/6.html 8 | 9 | # Algorithm: 10 | # 1. We are given the starting and ending point (x1, y1) and (x2, y2) 11 | # 2. We compute the gradient m, using the formula: m = (y2-y1)/(x2-x1) 12 | # 3. The equation of the straight line is y = m*x+c. So the next thing we need to find is the intercept c 13 | # 4. Intercept can be derived using the formula c = y1 - m*x1 14 | # 5. To get the next point, we add dx to the x-coordinate and dy to the y-coordinate 15 | # 6. 
We continue this cycle until we reach (x2, y2) 16 | 17 | def lineGenerator(x1, y1, x2, y2): 18 | dx = x2 - x1 19 | dy = y2 - y1 20 | 21 | slope = 2*dy - dx 22 | 23 | x = x1 24 | y = y1 25 | while x < x2: 26 | 27 | #Print current coordinates 28 | print(x, y) 29 | 30 | # x increases either way 31 | x += 1 32 | 33 | # 2dy is always added in the slope. Do it. 34 | slope += 2*dy 35 | #Check for the current slope 36 | if slope >= 0: 37 | y += 1 38 | slope -= 2 * (x2-x1) 39 | 40 | elif slope <= 0: 41 | #No changes are made. 42 | slope = slope 43 | 44 | 45 | # lineGenerator(3, 2, 15, 5) 46 | 47 | # # P1 is the point given. Initial point 48 | # # P2 is the point to reach. Final point 49 | # if P1[0] == P2[0] and P1[1] == P2[1]: 50 | # return 0 51 | # print(P1) 52 | # #Check if the point is above or below the line. 53 | # dx = P2[0]-P1[0] 54 | # dy = P2[1]-P1[0] 55 | 56 | # di = 2*dy - dx 57 | 58 | # currX = P1[0] 59 | # currY = P1[1] 60 | 61 | # if di > 0: 62 | # P1 = (currX+1, currY+1) 63 | # else: 64 | # P1 = (currX+1, currY) 65 | 66 | # return lineGenerator(P1, P2) 67 | -------------------------------------------------------------------------------- /bst_nodes_in_range.py: -------------------------------------------------------------------------------- 1 | # Problem: Find the no. of nodes in a BST that lies in a given range 2 | 3 | # Algorithm: We will traverse the tree recursively until we encounter leaf nodes (Base case) 4 | # else we do the following 5 | # 1. Current node is less than the given range --> Traverse the right subtree 6 | # 2. Current node is more than the given range --> Traverse the left subtree 7 | # 3. Current node lies in the given range --> Increment Count; Traverse both the left and right subtree 8 | 9 | # Data Structure for Tree Node 10 | class Node: 11 | def __init__(self, data): 12 | self.data = data 13 | self.left = None 14 | self.right = None 15 | 16 | 17 | def nodesWithinRange(root, range): 18 | low, high = range 19 | # this is the base case 20 | if root is None: 21 | return 0 22 | # NOTE: an early exit for boundary values would undercount, since a 23 | # node equal to low/high can still have in-range descendants; 24 | # such nodes are counted by the range check below 25 | # if the current node lies in the range 26 | elif root.data <= high and root.data >= low: 27 | return ( 28 | 1 + nodesWithinRange(root.left, range) + nodesWithinRange(root.right, range) 29 | ) 30 | # if the range lies in the left subtree 31 | elif root.data > high: 32 | return nodesWithinRange(root.left, range) 33 | # if the range lies in the right subtree 34 | elif root.data < low: 35 | return nodesWithinRange(root.right, range) 36 | 37 | if __name__ == "__main__": 38 | 39 | node = Node(10) 40 | node.left = Node(5) 41 | node.left.left = Node(1) 42 | node.right = Node(50) 43 | node.right.left = Node(45) 44 | node.right.right = Node(100) 45 | 46 | result = nodesWithinRange(node, (5, 45)) 47 | print(result) 48 | -------------------------------------------------------------------------------- /bubble_sort.py: -------------------------------------------------------------------------------- 1 | # A simple implementation of bubble sort 2 | 3 | def bubbleSort(arr): 4 | # traverse the whole array 5 | for i in range(len(arr)): 6 | 7 | # last i elements are already in place 8 | for j in range(0, len(arr)-i-1): 9 | 10 | if arr[j] > arr[j+1]: 11 | arr[j], arr[j+1] = arr[j+1], arr[j] 12 | 13 | return arr 14 | 15 | # Approach 2: This algorithm will run for O(n^2) even if the array is 16 | # already sorted. For avoiding this, we can check if elements are swapped 17 | # in each pass. 
We will break the loop in case they are not 18 | 19 | # Time Complexity: O(n^2) - Average or Worst Case; O(n) - Best case [Array is already sorted] 20 | 21 | def bubbleSortOptimized(arr): 22 | for i in range(len(arr)): 23 | swapped = False 24 | 25 | for j in range(0, len(arr)-i-1): 26 | 27 | if arr[j] > arr[j+1]: 28 | arr[j], arr[j+1] = arr[j+1], arr[j] 29 | swapped = True 30 | 31 | # if no elements are swapped, break the loop 32 | if not swapped: 33 | break 34 | 35 | return arr 36 | 37 | if __name__ == "__main__": 38 | arr = [2, 6, 1, 5, 3, 4] 39 | res = bubbleSort(arr) 40 | print(res) 41 | -------------------------------------------------------------------------------- /calculateClockAngle.py: -------------------------------------------------------------------------------- 1 | # Find the angle made by the hour hand and the minute 2 | # hand at any given time. Assume it is an analog clock 3 | 4 | def calculateAngle(hour, minute): 5 | if hour < 0 or minute < 0 or hour > 12 or minute > 60: 6 | print("Wrong inputs given...") 7 | return 8 | else: 9 | 10 | if hour == 12: 11 | hour = 0 12 | if minute == 60: 13 | minute = 0 14 | 15 | # hour hand moves 360° in 12 hours i.e. 16 | # 360/(12*60) ==> 0.5° every minute 17 | 18 | # similarly minute hand moves 360° in 1 hour i.e. 19 | # 360/60 ==> 6° every minute 20 | 21 | hour_angle = (hour * 60 + minute) * 0.5 22 | minute_angle = minute * 6 23 | 24 | # We take the absolute difference 25 | # and then return the smaller angle between the two 26 | difference = abs(hour_angle - minute_angle) 27 | 28 | return min(difference, 360 - difference) 29 | 30 | input_time = (9, 30) 31 | print("The angle between hour and minute hand is: ", calculateAngle(input_time[0], input_time[1]), '\u00b0') 32 | -------------------------------------------------------------------------------- /check_anagrams.py: -------------------------------------------------------------------------------- 1 | # Problem: Two strings of sizes m and n are given, 2 | # we have to find how many characters need to be 3 | # removed from both the string so that they become 4 | # anagrams of each other 5 | 6 | # Anagrams: Words that are made from rearranging the letters of another word 7 | 8 | # Algorithm: We will use dictionaries to keep track of characters. 
9 | # The idea is to get the common occurring characters and derive 10 | # uncommon characters 11 | 12 | import string 13 | 14 | # letters will be a string of the form "abc...xyz" 15 | # CHARACTER_HASH looks like this {'a':0, 'b':0, ..., 'z':0} 16 | letters = string.ascii_lowercase 17 | CHARACTER_HASH = dict(zip(letters, [0] * len(letters))) 18 | 19 | 20 | # This method will mark all the letters occurring in 'text_a' 21 | def mapLettersToHash(text_a): 22 | for char in text_a: 23 | if char in CHARACTER_HASH.keys(): 24 | CHARACTER_HASH[char] += 1 25 | 26 | 27 | # This method will count the letters present in 'text_b', also found in 'text_a' 28 | # These will be characters whose frequency in HASH is greater than zero 29 | def computeCommonLetters(text_b): 30 | common_letters = 0 31 | for char in text_b: 32 | if CHARACTER_HASH[char] > 0: 33 | common_letters += 1 34 | return common_letters 35 | 36 | 37 | # Now we derive how many uncommon letters are present, 38 | # This is done by subtracting twice the count of common letters 39 | # from the total length of both the strings 40 | def computeUncommonLetters(text_a, text_b, common_letters): 41 | return abs(len(text_a) + len(text_b) - (2 * common_letters)) 42 | 43 | if __name__ == "__main__": 44 | text_1 = "hello" 45 | text_2 = "billion" 46 | 47 | mapLettersToHash(text_1) 48 | common = computeCommonLetters(text_2) 49 | result = computeUncommonLetters(text_1, text_2, common) 50 | print(result) 51 | -------------------------------------------------------------------------------- /check_semiprime.py: -------------------------------------------------------------------------------- 1 | # Context: A semiprime is a product of two prime 2 | # numbers, not necessarily distinct. 3 | # Squares of prime numbers are also semiprimes. 4 | 5 | # Problem: Find the numbers which are semiprimes, 6 | # within a given range. For e.g. 1 to 100. 7 | 8 | 9 | def isSemiprime(num): 10 | # start with the smallest prime 11 | prime = 2 12 | # initialize counter to 0 13 | count = 0 14 | # Design of while loop: 15 | # 1. if count exceeds 2, it is not a semiprime, e.g. 30 = 2*3*5 16 | # 2. when the number becomes 1, we have found the second prime 17 | while count < 3 and num != 1: 18 | # if the number is divisible by current prime, 19 | # increment count, else move to new prime 20 | if not (num % prime): 21 | num = num / prime 22 | count = count + 1 23 | else: 24 | prime = prime + 1 25 | # if count is two, given number is a semiprime 26 | return count == 2 27 | 28 | 29 | for i in range(1, 100): 30 | if isSemiprime(i): 31 | print(i, end=" ") 32 | 33 | # Result: 4 6 9 10 14 15 21 22 25 26 33 34 35 38 39 46 49 34 | # 51 55 57 58 62 65 69 74 77 82 85 86 87 91 93 94 95 35 | -------------------------------------------------------------------------------- /data_science_interviews.md: -------------------------------------------------------------------------------- 1 | What is PEP 8 and why is it important? 2 | 3 | What is Scope in Python? 4 | 5 | What are lists and tuples? What is the key difference between the two? 6 | 7 | What are modules and packages in Python? 8 | 9 | What is self in Python? 10 | 11 | What are decorators in Python? 12 | 13 | What is lambda in Python? Why is it used? 14 | 15 | What are generators in Python? 16 | 17 | Can you create a series from the dictionary object in pandas? 18 | 19 | How will you delete indices, rows, and columns from a data frame? 20 | 21 | Can you get items of series A that are not available in another series B? 
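A sketch of one common answer (assuming pandas Series `A` and `B`; the names are illustrative): `A[~A.isin(B)]` keeps only the items of `A` that do not appear in `B`. The NumPy analogue is `np.setdiff1d(A, B)`.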
22 | 23 | How are NumPy arrays advantageous over Python lists? 24 | 25 | Write a Python function which takes a variable number of arguments. 26 | 27 | WAP (Write a program) which takes a sequence of numbers and checks if all numbers are unique. 28 | 29 | **************************************************************************** 30 | 31 | How do we use Eigenvalues and eigenvectors in PCA (Principal Components Analysis) ? 32 | 33 | Difference between exogenous variables and autoregression in time series forecasting. 34 | 35 | Difference between normalization and standardization, will it be used before train test split or after? 36 | 37 | How do you reduce the impact of one feature relative to others? 38 | 39 | Difference between XGBoost and FBProphet. 40 | 41 | Describe a scenario where you would not make the data stationary in a time series forecasting problem 42 | 43 | BERT is trained on which dataset? What model will be used if BERT does not exist? Describe the self-attention mechanism. 44 | 45 | Difference between univariate and multivariate time series forecasting problems. 46 | **************************************************************************** 47 | 48 | Find the middle node of a given LinkedList. 49 | `Used two pointer approach` 50 | `Slow Pointer = node.next`, and 51 | `Fast pointer = node.next.next;` 52 | at each iteration check if either pointer equals `null`. 53 | When the fast pointer is null, the slow pointer will be at the middle node; just print node.data to get the result. 54 | 55 | 56 | Print all the permutations of a given string. 57 | There are two approaches for this: either 58 | we can use the `itertools.permutations` library or 59 | we can code it ourselves with recursion (there are `n!` permutations in total). 60 | 61 | 62 | Third last node of LinkedList, 63 | above mentioned two pointer approach will be used here as well. 64 | 65 | Difference between `call by value` and `call by reference`. 66 | In call by value, we pass a copy of the variable to the function whereas 67 | in call by reference we pass the actual variable into the function. 68 | How do we do that? We pass the memory address of that variable to the function. 69 | These concepts are used with pointers in C/C++. 70 | 71 | Difference between `==` and `===` in JavaScript. 72 | Both are used for comparison 73 | double equal to is a content comparator whereas 74 | triple equals compares both content and data types of LHS & RHS. 75 | 76 | Difference between Breadth-first search & Depth first search. 77 | 78 | 79 | **************************************************************************** 80 | Explanation of the past project. What were the features used and how did you determine performance? 81 | 82 | What is the difference between linear regression and logistic regression? 83 | 84 | What is the internal working of logistic regression (LR)? 85 | 86 | What is the loss function of LR? 87 | 88 | Name some hyperparameters used in LR? Why do we use regularization? 89 | 90 | How do you deal with imbalanced data? 91 | 92 | When do we use accuracy as a metric? When should we not use accuracy? 93 | 94 | What is SMOTE and how is it different from stratified sampling? 95 | 96 | Watch this video to understand how SMOTE works [https://www.youtube.com/watch?v=U3X98xZ4_no] 97 | 98 | What is better 0.51 AUC (Area Under the Curve) or 0.43 F1 score? Which one should you present to a client? 99 | 100 | Watch this video to understand how AUC is interpreted [https://www.youtube.com/watch?v=mUMd_cKU0VM] 101 | 102 | What does the ROC AUC value signify? 
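For intuition: ROC AUC is the probability that a randomly chosen positive example is scored higher than a randomly chosen negative one. A minimal check with made-up labels and scores (a sketch using scikit-learn's `roc_auc_score`):

`from sklearn.metrics import roc_auc_score`
`print(roc_auc_score([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8])) # 0.75 -> 3 of the 4 positive/negative pairs are ranked correctly`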
103 | 104 | Do we only use the threshold of 0.5 or can we use other thresholds in LR? If yes, how do we find them? 105 | 106 | Can I use a sales forecasting model built using pencils data to be used in erasers data? 107 | 108 | How would you compare the performance of two forecasting models? 109 | 110 | What are the different metrics used in regression analysis? Which metric should be used where? 111 | 112 | How do you build a testing pipeline for a data science model? [https://www.kdnuggets.com/2020/08/unit-test-data-pipeline-thank-yourself-later.html] 113 | 114 | **************************************************************************** 115 | 116 | How do iterators and generators work in Python ? 117 | 118 | What do Python constructors do and how are they useful ? 119 | 120 | Explain what the map function does in Python ? 121 | 122 | How do you flatten an image (matrix) in a deep learning architecture ? 123 | 124 | Difference between semantic segmentation and instance segmentation ? 125 | 126 | Which are the different types of pooling operations - what is the visual effect of applying a max pooling operation and average pooling operation on an image ? 127 | 128 | What is the math behind convolution operation – what will be the size of a particular image (128*128) after convolution operation with a 3*3 kernel ? 129 | 130 | What will be the size of a particular image (128*128) after a convolution operation with a 1*1 kernel ? 131 | 132 | What is the Loss function and optimization function of region proposal network ? 133 | 134 | What is Image down sampling – why do we do down sampling ? 135 | 136 | Python coding: Solve the following using a for loop, by defining a function and putting it inside a class 137 | 138 | `#Input : a =[1,2,3] ` 139 | 140 | `#Output : ["hello1","hello2","hello3"]` 141 | 142 | Tradeoff between YOLO and FasterRCNN in terms of speed and accuracy ? 143 | 144 | What are feature maps and how are they obtained ? 145 | 146 | **************************************************************************** 147 | How will you count unique values in a data frame column? 148 | 149 | How will you convert a column data type to string ? 150 | 151 | How will you obtain correlation coefficient between 2 columns in a data frame ? 152 | 153 | How will you merge two data frames based on a common column (when the column name is the same) ? 154 | 155 | How will you merge two data frames based on a common column (when the column name is different in the left and right data frames) ? 156 | 157 | Define the term correlation with respect to statistics ? 158 | 159 | What are the types of correlation coefficient? 160 | 161 | What is the difference between the Pearson correlation coefficient and the Spearman correlation coefficient? 162 | 163 | How do we deal with categorical variables for statistical analysis? 164 | 165 | How do you obtain correlation between 2 categorical variables? 166 | How do you find correlation between one categorical variable and other numerical variables? 167 | 168 | What is the difference between dictionary and list? 169 | 170 | How do you append a dictionary with another dictionary? 171 | 172 | What is the difference between tuples and lists ? 173 | 174 | Can a tuple contain elements of different data types ? 175 | 176 | How do you read data from a database directly and convert it into a data frame for analysis? 177 | 178 | How do you import a function from file.py into another Python file ? 179 | 180 | What are generators in Python ? 
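A minimal illustration (hypothetical example, not from the notes): a generator produces values lazily with `yield` instead of building the whole list in memory.

`def countdown(n):`
`    while n > 0:`
`        yield n`
`        n -= 1`
`print(list(countdown(3))) # [3, 2, 1]`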
181 | 182 | How will you print the index and values of a list without the range function ? 183 | 184 | **************************************************************************** 185 | 186 | What is the difference between Docker and Containers? 187 | 188 | How do you restart containers on failure? 189 | 190 | How do you run a container in Docker? 191 | 192 | Can you run a program that takes 4 hours to run in AWS Lambda? 193 | 194 | What is the difference between ADD and COPY commands wrt. Dockerfile ? 195 | 196 | Experience with different AWS services such as CloudFormation or Glue? 197 | 198 | What is the schema in S3? 199 | 200 | Can the lambda written in AWS interact with other infrastructure? 201 | 202 | What is the Dockerfile setup if you want to expose the model as an API? 203 | 204 | Difference between UDF, pandas UDF and pyspark UDFs? 205 | 206 | Difference between synchronous and asynchronous request? How do you program one in Python? 207 | 208 | What is the use of a DAG (Directed Acyclic Graph) in Spark? 209 | 210 | Given the no. of terms, print the Fibonacci sequence: Hint try both iterative and recursive methods [https://www.programiz.com/python-programming/examples/fibonacci-sequence] 211 | 212 | Given an input string, print the length of the longest substring without any repeating characters. [https://leetcode.com/problems/longest-substring-without-repeating-characters/] 213 | 214 | Given an input string, write a function that returns the Run Length Encoded string for the input string. For example, if the input string is “ssslbbbbppiitttc”, then the function should return “s3l1b4p2i2t3c1” 215 | 216 | **************************************************************************** 217 | 218 | Given a list, `ls = [9,8,3,4,1,0,2,7,7,6]`, write a function to get the nth highest element without using any inbuilt functions or sorting. 219 | 220 | Write a Python class with a method to sort a list and related questions on classes, static methods, init etc. 221 | 222 | Difference between `RANK` and `DENSE RANK`? 223 | 224 | Difference between `parquet` and `csv` file format? How are files written in a parquet file? 225 | 226 | What is Cursor command in SQL? 227 | 228 | Difference between Spark and MapReduce architecture? 229 | 230 | Explanation of ETL pipeline 231 | 232 | Containerization v/s virtualization 233 | 234 | What is port redirection in docker? 235 | 236 | How to create a table with Databricks storage? 237 | 238 | Difference between SQL and NoSQL DB? 239 | 240 | A scenario where the data keeps changing, with new features being added and updated: would you consider SQL or NoSQL? 241 | 242 | What is the difference between iterators and generators? 243 | 244 | What is the difference between OLAP and OLTP? 245 | 246 | **************************************************************************** 247 | -------------------------------------------------------------------------------- /dfs_bfs.py: -------------------------------------------------------------------------------- 1 | # Given a graph, there are two methods to 2 | # perform traversal on it. 3 | # 1. Depth First Search (DFS) 4 | # 2. Breadth First Search (BFS) 5 | 6 | # Breadth First Search: 7 | # We check the adjacent nodes first and then mark them visited to explore their adjacent nodes. 
8 | # This uses a queue to keep track of the visited nodes in FIFO style 9 | # In BFS, one vertex is selected and marked as visited, then its adjacent vertices are visited and stored in the queue 10 | # The goal is to get the shortest path by traversing the minimum no. of edges in the graph. 11 | # BFS tries all possible paths at the same time and then uses a tie-breaking strategy to decide the best path 12 | # It is used to search for the solution among the nearest nodes first 13 | # Hence it is useful for social networks where in-depth exploration is not required 14 | 15 | # Depth First Search: 16 | # We explore the first path we discover and go deep until we encounter a dead end 17 | # This uses a stack to keep track of the visited nodes and performs `backtracking` in case a dead end is met 18 | # DFS is faster than BFS when exploration is the priority 19 | # DFS will always find a path but that may not be the shortest path, unlike BFS 20 | 21 | def dfs_1(graph, start): 22 | visited, stack = set(), [start] 23 | while stack: 24 | vertex = stack.pop() 25 | if vertex not in visited: 26 | visited.add(vertex) 27 | stack.extend(graph[vertex] - visited) 28 | return visited 29 | 30 | 31 | def dfs_2(graph, start, visited=None): 32 | if visited is None: 33 | visited = set() 34 | visited.add(start) 35 | for next in graph[start] - visited: 36 | dfs_2(graph, next, visited) 37 | return visited 38 | 39 | 40 | def bfs(graph, start): 41 | visited, queue = set(), [start] 42 | while queue: 43 | vertex = queue.pop(0) 44 | if vertex not in visited: 45 | visited.add(vertex) 46 | queue.extend(graph[vertex] - visited) 47 | return visited 48 | 49 | 50 | # bfs(graph, 'A') # {'B', 'C', 'A', 'F', 'D', 'E'} 51 | 52 | 53 | def dfs_paths(graph, start, goal): 54 | stack = [(start, [start])] 55 | while stack: 56 | (vertex, path) = stack.pop() 57 | for next in graph[vertex] - set(path): 58 | if next == goal: 59 | yield path + [next] 60 | else: 61 | stack.append((next, path + [next])) 62 | 63 | 64 | graph = { 65 | "A": set(["B", "C"]), 66 | "B": set(["A", "D", "E"]), 67 | "C": set(["A", "F"]), 68 | "D": set(["B"]), 69 | "E": set(["B", "F"]), 70 | "F": set(["C", "E"]), 71 | } 72 | 73 | result = list(dfs_paths(graph, "A", "F")) # [['A', 'C', 'F'], ['A', 'B', 'E', 'F']] 74 | print(result) 75 | -------------------------------------------------------------------------------- /diameterOfTree.py: -------------------------------------------------------------------------------- 1 | # The diameter of a tree (sometimes called the width) 2 | # is the number of nodes on the longest path between 3 | # two end nodes (the two leaf nodes farthest apart). 
4 | 5 | # The diameter of a tree T is the largest of the following quantities: 6 | # the diameter of T’s left subtree 7 | # the diameter of T’s right subtree 8 | # the longest path between leaves that goes through the root of T (this can be computed from the heights of the subtrees of T) 9 | 10 | # Algorithm: 11 | # Case 1: if the diameter passes through the root, then diameter is 12 | # height of left subtree + height of right subtree + 1 (root node) 13 | # d = lheight + rheight + 1 14 | 15 | # Case 2: if the diameter is not passing through the root 16 | # Search for diameter in the left subtree and right subtree 17 | # Pick the larger value of the two subtrees 18 | # d = max(ldiameter, rdiameter) 19 | 20 | # Finally take max of the two values since we do not 21 | # know if diameter is passing through the root or not 22 | # d = max(lheight + rheight + 1, max(ldiameter, rdiameter)) 23 | 24 | class Node(object): 25 | def __init__(self, data): 26 | self.data = data 27 | self.left = None 28 | self.right = None 29 | 30 | def height(tree): 31 | if tree is None: 32 | return 0 33 | else: 34 | return 1 + max(height(tree.left), height(tree.right)) 35 | 36 | def diameter(tree): 37 | if tree is None: 38 | return 0 39 | 40 | else: 41 | lheight = height(tree.left) 42 | rheight = height(tree.right) 43 | 44 | ldiameter = diameter(tree.left) 45 | rdiameter = diameter(tree.right) 46 | 47 | return max(rheight + lheight + 1, max(ldiameter, rdiameter)) 48 | 49 | root = Node(1) 50 | root.left = Node(2) 51 | root.right = Node(3) 52 | root.left.left = Node(4) 53 | root.left.right = Node(5) 54 | print("Diameter of given binary tree is ", diameter(root)) 55 | -------------------------------------------------------------------------------- /estimate_pi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import pi as PI 3 | 4 | def estimate_pi(sims): 5 | """ 6 | takes the number of simulations as input to estimate pi 7 | """ 8 | 9 | # counter to hold points lying inside the circle 10 | in_circle = 0 11 | 12 | for s in range(0,sims): 13 | 14 | x = np.random.rand() 15 | y = np.random.rand() 16 | 17 | if (x**2 + y**2) <= 1: 18 | in_circle += 1 19 | 20 | # The ratio of pts. inside the circle and the total pts. will be the same as the ratio 21 | # of the area of circle to the area of the square, inside which the circle is inscribed 22 | # Area of circle = PI * R * R 23 | # Area of square = (2R) * (2R) 24 | 25 | pi_estimated = 4.0 * in_circle / sims 26 | 27 | print("Simulations ran: ", sims) 28 | print("Estimated pi", pi_estimated) 29 | print("Error", PI - pi_estimated) 30 | 31 | pow = 0 32 | input_sims = 100 33 | while pow <= 8: 34 | estimate_pi(sims=input_sims) 35 | pow += 1 36 | input_sims *= 10 37 | 38 | -------------------------------------------------------------------------------- /find_k_largest.py: -------------------------------------------------------------------------------- 1 | # Write an efficient program for 2 | # printing k largest elements in 3 | # an array. Elements in array can 4 | # be in any order. 
5 | # Time Complexity: O(NlogN) + O(k) 6 | 7 | def findKLargest(arr, k): 8 | arr.sort(reverse=True) 9 | for i in range(0, k): 10 | print(arr[i], end=" ") 11 | 12 | arr = [1, 23, 12, 9, 30, 2, 50] 13 | k = 3 14 | findKLargest(arr, k) 15 | -------------------------------------------------------------------------------- /find_m_to_last_llist.py: -------------------------------------------------------------------------------- 1 | # Given a linked list, this method 2 | # will return m'th element to the last 3 | # 2->3->4->8->5; m=2 will return 8 4 | # since 8 is second to last 5 | 6 | from linked_list_data_structure import LinkedList 7 | 8 | 9 | def findMToLast(l_list, m): 10 | current = l_list.head 11 | count = 0 12 | 13 | while current is not None and count < m: 14 | count += 1 15 | current = current.getNextNode() 16 | 17 | m_behind = l_list.head 18 | while current is not None: 19 | current = current.getNextNode() 20 | m_behind = m_behind.getNextNode() 21 | 22 | return m_behind 23 | 24 | 25 | linked_list = LinkedList() 26 | m_to_last = 3 27 | # Returns the third element from last 28 | print(findMToLast(linked_list, m_to_last)) 29 | -------------------------------------------------------------------------------- /find_pairs_sum_k.py: -------------------------------------------------------------------------------- 1 | # Given an array of numbers, find all the 2 | # pairs of numbers which sum upto `k` 3 | 4 | 5 | def find_pairs(num_array, k): 6 | pairs_array = [] 7 | for num in num_array: 8 | if (k - num) in num_array: 9 | pairs_array.append((num, (k - num))) 10 | return pairs_array 11 | 12 | 13 | result = find_pairs([0, 14, 0, 4, 7, 8, 3, 5, 7], 11) 14 | print(result) 15 | -------------------------------------------------------------------------------- /find_products_pair_k.py: -------------------------------------------------------------------------------- 1 | def product_pair(arr, x): 2 | arr_sorted = sorted(arr) 3 | 4 | for i in range(0, len(arr_sorted)): 5 | sub_array = arr_sorted[i + 1 :] 6 | if arr_sorted[i] != 0 and x % arr_sorted[i] == 0 and x // arr_sorted[i] in sub_array: 7 | return True 8 | 9 | return False 10 | 11 | 12 | arr = [10, 20, 9, 40] 13 | x = 400 14 | 15 | res = product_pair(arr, x) 16 | print(res) 17 | -------------------------------------------------------------------------------- /find_pythagoras_triplet.py: -------------------------------------------------------------------------------- 1 | # Given an array of integers, write a function that 2 | # returns true if there is a triplet (a, b, c) that 3 | # satisfies a^2 + b^2 = c^2. 
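# A worked trace of the two-pointer scan below, using the test case at the
# bottom of this file: [3, 1, 4, 6, 5] is squared to [9, 1, 16, 36, 25];
# after sorting ([1, 9, 16, 25, 36]) the scan finds 9 + 16 == 25, i.e. the
# triplet (3, 4, 5), so the function returns True.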
4 | 5 | def findPythagorasTriplet(arr, n): 6 | # convert the array to squares 7 | for i in range(0, n): 8 | arr[i] = arr[i] * arr[i] 9 | 10 | # sort the array 11 | arr.sort() 12 | 13 | # use meet in the middle to find the pair (a,b) 14 | for i in range(n-1, 1, -1): 15 | j = 0 16 | k = i-1 17 | 18 | while (j < k): 19 | # a pair is found 20 | if (arr[j] + arr[k]) == arr[i]: 21 | return True 22 | else: 23 | if (arr[j] + arr[k]) < arr[i]: 24 | j = j + 1 25 | else: 26 | k = k - 1 27 | 28 | return False 29 | 30 | ar = [3, 1, 4, 6, 5] 31 | ar_size = len(ar) 32 | if(findPythagorasTriplet(ar, ar_size)): 33 | print("Yes") 34 | else: 35 | print("No") 36 | -------------------------------------------------------------------------------- /find_second_largest_in_binary_tree.py: -------------------------------------------------------------------------------- 1 | # Given a binary tree, find the second largest 2 | # node in it 3 | 4 | 5 | class Node: 6 | def __init__(self, data): 7 | self.data = data 8 | self.left = None 9 | self.right = None 10 | 11 | 12 | def find_largest(root): 13 | current = root 14 | # keep following right children to reach the maximum node 15 | while current.right is not None: 16 | current = current.right 17 | return current.data 18 | 19 | 20 | def find_second_largest(root): 21 | if root is None or (root.left is None and root.right is None): 22 | raise ValueError("Tree must have at least 2 nodes") 23 | 24 | current = root 25 | 26 | while current is not None: 27 | if current.left is not None and current.right is None: 28 | return find_largest(current.left) 29 | 30 | if ( 31 | current.right is not None 32 | and current.right.left is None 33 | and current.right.right is None 34 | ): 35 | return current.data 36 | 37 | current = current.right 38 | 39 | 40 | node = Node(10) 41 | node.left = Node(5) 42 | node.left.left = Node(1) 43 | node.right = Node(50) 44 | node.right.left = Node(45) 45 | node.right.right = Node(100) 46 | 47 | result = find_second_largest(node) 48 | print(result) # prints 50 49 | -------------------------------------------------------------------------------- /first_n_fibo.py: -------------------------------------------------------------------------------- 1 | # Write a function that computes the 2 | # list of the first 100 Fibonacci numbers 3 | 4 | FIB_ARR = [0, 1] 5 | 6 | def first_n_fibo(n): 7 | if n < 2: 8 | return FIB_ARR[:n] 9 | else: 10 | while len(FIB_ARR) < n: 11 | FIB_ARR.append(FIB_ARR[-1] + FIB_ARR[-2]) 12 | return FIB_ARR 13 | 14 | 15 | n = 10 16 | arr = first_n_fibo(n) 17 | print(arr) 18 | -------------------------------------------------------------------------------- /first_non_repeating.py: -------------------------------------------------------------------------------- 1 | # Given an input string, it gives the 2 | # first non repeating character in it 3 | # There are two implementations below 4 | # 1. has less space complexity 5 | # 2. 
has less time complexity 6 | 7 | 8 | def first_non_repeating(input_string): 9 | frequency = dict() 10 | flag = None 11 | 12 | for char in input_string: 13 | if char in frequency.keys(): 14 | frequency[char] += 1 15 | else: 16 | frequency[char] = 0 17 | 18 | for char in input_string: 19 | if frequency[char] == 0: 20 | flag = char 21 | break 22 | 23 | return flag 24 | 25 | 26 | # lesser time complexity 27 | # more space complexity 28 | # obvious space-time trade-off 29 | def first_non_repeating_v2(input_string): 30 | 31 | flag = None 32 | repeating = [] 33 | non_repeating = [] 34 | 35 | for char in input_string: 36 | if char in non_repeating: 37 | non_repeating.remove(char) 38 | repeating.append(char) 39 | else: 40 | non_repeating.append(char) 41 | 42 | if len(non_repeating) == 0: 43 | pass 44 | else: 45 | flag = non_repeating[0] 46 | 47 | return flag 48 | 49 | 50 | result = first_non_repeating("djebdedbekfrnkfnduwbdwkd") 51 | print(result) # j 52 | 53 | result = first_non_repeating("aabbcc") 54 | print(result) # None 55 | -------------------------------------------------------------------------------- /first_recurring_character.py: -------------------------------------------------------------------------------- 1 | # Given an input string, find the first 2 | # recurring character in it. 3 | 4 | 5 | def first_recurring_character(input): 6 | flag = None 7 | d = dict() 8 | for char in input: 9 | if char in d.keys(): 10 | flag = char 11 | return flag 12 | d[char] = 1 13 | return flag 14 | 15 | 16 | result = first_recurring_character("DFGHJWERGBFGHJ") 17 | print(result) # G 18 | result = first_recurring_character("ABCDEFGH") 19 | print(result) # None 20 | result = first_recurring_character("12345642124345") 21 | print(result) # 4 22 | -------------------------------------------------------------------------------- /first_unique_letter.py: -------------------------------------------------------------------------------- 1 | # Problem: Given a string, find the first non-repeating 2 | # character in it. For example, if the input string is 3 | # “GeeksforGeeks”, then output should be ‘f’ and if input 4 | # string is “GeeksQuiz”, then output should be ‘G’. 5 | 6 | import string 7 | 8 | letters = string.ascii_lowercase 9 | CHARACTER_HASH = dict(zip(letters, [0] * len(letters))) 10 | 11 | 12 | def mapLettersToHash(text_a): 13 | for char in text_a: 14 | if char in CHARACTER_HASH.keys(): 15 | CHARACTER_HASH[char] += 1 16 | 17 | 18 | def getFirstUniqueLetter(text_a): 19 | for char in text_a: 20 | if CHARACTER_HASH[char] == 1: 21 | return char 22 | 23 | 24 | text_1 = "geeksquiz" 25 | 26 | mapLettersToHash(text_1) 27 | result = getFirstUniqueLetter(text_1) 28 | print(result) 29 | -------------------------------------------------------------------------------- /gamblers_ruin.py: -------------------------------------------------------------------------------- 1 | """ 2 | David vs. Goliath Gambler's Ruin Simulation 3 | 4 | This program simulates a gambling scenario between two players: David and Goliath. 5 | David has a skill advantage, represented by a 55% probability of winning each round, 6 | while Goliath has a size advantage with a larger initial amount of money. 7 | 8 | Assumptions: 9 | - David starts with $2,000, and Goliath starts with $10,000. 10 | - Each round of betting results in a transfer of $1,000 from the loser to the winner. 11 | - The game continues until one player runs out of money (i.e., their amount reaches zero). 
12 | - The outcome of each round is determined by a random number generator, reflecting David's skill advantage. 13 | 14 | Mathematics: 15 | - The simulation models a stochastic process where each round can be viewed as an independent Bernoulli trial: 16 | - David wins with a probability of 0.55. 17 | - Goliath wins with a probability of 0.45. 18 | - The expected outcomes can be analyzed using concepts from probability theory and stochastic processes. 19 | - The simulation runs for a specified number of trials to gather statistical data on how often David wins compared to Goliath. 20 | 21 | Usage: 22 | 1. Run the program in a Python environment. 23 | 2. Input the desired number of simulations when prompted. 24 | 3. The program will output the number of wins for both David and Goliath and display a bar chart of the results. 25 | 26 | This simulation provides insights into how skill can offset size advantages in competitive scenarios. 27 | """ 28 | 29 | import random 30 | import matplotlib.pyplot as plt 31 | 32 | def gambler_ruin(david_initial, goliath_initial, david_win_prob, simulations): 33 | results = [] 34 | 35 | for _ in range(simulations): 36 | david_amount = david_initial 37 | goliath_amount = goliath_initial 38 | 39 | while david_amount > 0 and goliath_amount > 0: 40 | # Simulate a single bet based on David's winning probability 41 | if random.random() < david_win_prob: # David wins 42 | david_amount += 1000 43 | goliath_amount -= 1000 44 | else: # Goliath wins 45 | david_amount -= 1000 46 | goliath_amount += 1000 47 | 48 | # Record the result: True if David wins, False if Goliath wins 49 | results.append(david_amount > 0) 50 | 51 | return results 52 | 53 | def plot_results(results): 54 | wins = sum(results) 55 | losses = len(results) - wins 56 | 57 | plt.bar(['David Wins', 'Goliath Wins'], [wins, losses], color=['blue', 'red']) 58 | plt.title('David vs. Goliath Simulation Results') 59 | plt.ylabel('Number of Simulations') 60 | plt.show() 61 | 62 | def main(): 63 | david_initial = 2000 # David's initial amount 64 | goliath_initial = 10000 # Goliath's initial amount 65 | david_win_prob = 0.55 # David's skill advantage (55%), matching the docstring 66 | simulations = int(input("Enter number of simulations: ")) 67 | 68 | results = gambler_ruin(david_initial, goliath_initial, david_win_prob, simulations) 69 | 70 | print(f"\nResults after {simulations} simulations:") 71 | print(f"David Wins: {sum(results)}") 72 | print(f"Goliath Wins: {len(results) - sum(results)}") 73 | 74 | plot_results(results) 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /gen_largest_num_frm_list.py: -------------------------------------------------------------------------------- 1 | # Write a function that given a list of non 2 | # negative integers, arranges them such that 3 | # they form the largest possible number. 
--------------------------------------------------------------------------------
/gen_largest_num_frm_list.py:
--------------------------------------------------------------------------------
1 | # Write a function that given a list of non
2 | # negative integers, arranges them such that
3 | # they form the largest possible number. For
4 | # example, given [50, 2, 1, 9], the largest
5 | # formed number is 95021
6 | 
7 | from itertools import permutations
8 | 
9 | # brute force: try every ordering, O(n! * n); all candidates have the
10 | # same length, so the lexicographic max string is the numeric max
11 | def generate_largest_number(arr):
12 |     gen_nums = []
13 |     for i in permutations(arr, len(arr)):
14 |         gen_nums.append("".join(map(str, i)))
15 |     return max(gen_nums)
16 | 
17 | arr = [54, 546, 548, 60]
18 | print(generate_largest_number(arr))  # 6054854654
--------------------------------------------------------------------------------
/general_tree_structure.py:
--------------------------------------------------------------------------------
1 | class Node:
2 |     def __init__(self, value=None, children=None):
3 |         self.value = value
4 |         # avoid a mutable default argument: a default list
5 |         # would be shared by every Node instance
6 |         self.children = children if children is not None else []
7 | 
8 |     def getValue(self):
9 |         return self.value
10 | 
11 |     def setValue(self, new_value):
12 |         self.value = new_value
13 | 
14 |     def getNumChildren(self):
15 |         return len(self.children)
16 | 
17 |     def getChild(self, index):
18 |         return self.children[index]
19 | 
20 | 
21 | root = Node(5, [Node(2), Node(6)])
22 | print(root.getValue())  # 5
23 | print(root.getNumChildren())  # 2
24 | print(root.getChild(1).getValue())  # 6
--------------------------------------------------------------------------------
/getMinPlatforms.py:
--------------------------------------------------------------------------------
1 | # Given the arrival and departure times of all trains
2 | # that reach a railway station, the task is to find the
3 | # minimum number of platforms required for the railway
4 | # station so that no train waits.
5 | 
6 | # We are given two arrays that represent the arrival and
7 | # departure times of trains that stop.
8 | 
9 | def getMinPlatforms(arr, dep):
10 |     if len(arr) != len(dep):
11 |         print("Wrong inputs given...")
12 |         return
13 |     else:
14 |         sorted_arr = sorted(arr + dep)
15 | 
16 |     minPlatform = 0
17 |     trainsAtPlatform = 0
18 | 
19 |     # NOTE: the membership tests below cost O(n) each, making this
20 |     # O(n^2) overall; see the two-pointer sketch after this function
21 |     for i in sorted_arr:
22 |         if i in arr:
23 |             trainsAtPlatform += 1
24 |         if i in dep:
25 |             trainsAtPlatform -= 1
26 |         minPlatform = max(minPlatform, trainsAtPlatform)
27 | 
28 |     return minPlatform
29 | 
30 | arrivalArr = [900, 940, 950, 1100, 1500, 1800]
31 | departureArr = [910, 1200, 1120, 1130, 1900, 2000]
32 | 
33 | result = getMinPlatforms(arrivalArr, departureArr)
34 | 
35 | print("Minimum no. of platforms for the given timetable is:", result)
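36 | 
37 | # Hedged alternative (sketch): sorting arrivals and departures
38 | # separately and sweeping with two pointers keeps the whole scan
39 | # O(n log n) and avoids the per-element membership tests above.
40 | def getMinPlatformsV2(arr, dep):
41 |     arr, dep = sorted(arr), sorted(dep)
42 |     i = j = platforms = needed = 0
43 |     while i < len(arr):
44 |         if arr[i] <= dep[j]:
45 |             # next event is an arrival: one more platform in use
46 |             platforms += 1
47 |             i += 1
48 |         else:
49 |             # next event is a departure: a platform frees up
50 |             platforms -= 1
51 |             j += 1
52 |         needed = max(needed, platforms)
53 |     return needed
54 | 
55 | # print(getMinPlatformsV2(arrivalArr, departureArr))  # 3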
--------------------------------------------------------------------------------
/get_dup_chars.py:
--------------------------------------------------------------------------------
1 | # Print duplicate characters in a string
2 | 
3 | 
4 | def get_dup_chars(input_str):
5 |     dedupe_str = ""
6 |     dup_chars = []
7 | 
8 |     for char in input_str:
9 |         if char not in dedupe_str:
10 |             dedupe_str += char
11 |         else:
12 |             dup_chars.append(char)
13 | 
14 |     return dup_chars
15 | 
16 | 
17 | result = get_dup_chars("zmaxxazkgv")
18 | print(result)  # ['x', 'a', 'z']
--------------------------------------------------------------------------------
/hasZeroSumSubArray.py:
--------------------------------------------------------------------------------
1 | # This method returns the sum of numbers
2 | # present till each index
3 | 
4 | def getPrefixArray(arr):
5 |     return [sum(arr[:i]) for i in range(1, len(arr)+1)]
6 | 
7 | # This method will create a mapping of numbers
8 | # and the indices they are present at
9 | 
10 | def getIndexMap(arr):
11 |     indexMap = {}
12 | 
13 |     for i in range(len(arr)):
14 |         if arr[i] not in indexMap.keys():
15 |             indexMap[arr[i]] = [i,]
16 |         else:
17 |             indexMap[arr[i]].append(i)
18 | 
19 |     return indexMap
20 | 
21 | # This method builds the prefix sums of the array; if any prefix
22 | # sum repeats, or is itself zero, there is a zero-sum subarray
23 | 
24 | def hasZeroSum(arr):
25 |     prefixArr = getPrefixArray(arr)
26 | 
27 |     # a prefix sum of 0 means a zero-sum subarray starting at index 0
28 |     if 0 in prefixArr:
29 |         return True
30 | 
31 |     sumAtIndexMap = getIndexMap(prefixArr)
32 | 
33 |     for v in sumAtIndexMap.values():
34 |         if len(v) > 1:
35 |             return True
36 |     return False
37 | 
38 | 
39 | ipArray = [1, 4, -2, -2, 5, -4, 3]
40 | hasZeroSumTrue = hasZeroSum(ipArray)
41 | print(hasZeroSumTrue)  # True
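42 | 
43 | # Hedged one-pass variant (sketch): the prefix array and index map
44 | # can be fused into a single scan over a set of prefix sums seen.
45 | def hasZeroSumV2(arr):
46 |     seen = {0}  # the empty prefix covers subarrays starting at index 0
47 |     prefix = 0
48 |     for num in arr:
49 |         prefix += num
50 |         if prefix in seen:
51 |             return True
52 |         seen.add(prefix)
53 |     return False
54 | 
55 | # print(hasZeroSumV2(ipArray))  # True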
--------------------------------------------------------------------------------
/has_only_digits.py:
--------------------------------------------------------------------------------
1 | # Given a string, check if it only contains digits.
2 | 
3 | 
4 | def is_digit(input_str):
5 |     try:
6 |         # If it's possible to convert to a number, return True.
7 |         # (Note: int() also tolerates a leading sign and whitespace.)
8 |         int(input_str)
9 |         return True
10 |     except ValueError:
11 |         # If the string contains letters, the above fails, returning False.
12 |         return False
13 | 
14 | 
15 | result = is_digit("095357973590759530")
16 | print(result)  # True
17 | 
18 | result = is_digit("1234abc567")
19 | print(result)  # False
--------------------------------------------------------------------------------
/haversine.py:
--------------------------------------------------------------------------------
1 | from math import radians, cos, sin, asin, sqrt
2 | 
3 | def haversine(lon1, lat1, lon2, lat2):
4 |     """
5 |     Calculate the great circle distance between two points
6 |     on the earth (specified in decimal degrees)
7 |     """
8 |     # convert decimal degrees to radians
9 |     lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
10 | 
11 |     # haversine formula
12 |     dlon = lon2 - lon1
13 |     dlat = lat2 - lat1
14 |     a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
15 |     c = 2 * asin(sqrt(a))
16 |     r = 6371  # Radius of earth in kilometers. Use 3956 for miles
17 |     return c * r
--------------------------------------------------------------------------------
/heap_structure.py:
--------------------------------------------------------------------------------
1 | class Heap(object):
2 | 
3 |     HEAP_SIZE = 10
4 | 
5 |     def __init__(self):
6 |         self.heap = [0] * Heap.HEAP_SIZE
7 |         self.currentPosition = -1
8 | 
9 |     def insert(self, item):
10 | 
11 |         # if heap is full, we print a notification
12 |         if self.isFull():
13 |             print("Heap is full")
14 |             return
15 |         # else, increment the currentPosition and add item
16 |         self.currentPosition += 1
17 |         self.heap[self.currentPosition] = item
18 |         self.fixUp(self.currentPosition)
19 | 
20 |     def fixUp(self, index):
21 |         parentIndex = (index - 1) // 2
22 |         while parentIndex >= 0 and self.heap[parentIndex] < self.heap[index]:
23 |             # bubble the new item up while it is larger than its parent
24 |             self.heap[index], self.heap[parentIndex] = self.heap[parentIndex], self.heap[index]
25 |             # update the index and parentIndex
26 |             index = parentIndex
27 |             parentIndex = (index - 1) // 2
28 | 
29 |     def fixDown(self, index, upto):
30 |         if upto < 0:
31 |             upto = self.currentPosition
32 | 
33 |         # sink the root down while it is smaller than a child
34 |         while 2 * index + 1 <= upto:
35 |             leftChild = 2 * index + 1
36 |             rightChild = 2 * index + 2
37 | 
38 |             # pick the larger child that is still inside the heap
39 |             if rightChild <= upto and self.heap[rightChild] > self.heap[leftChild]:
40 |                 childToSwap = rightChild
41 |             else:
42 |                 childToSwap = leftChild
43 | 
44 |             if self.heap[index] < self.heap[childToSwap]:
45 |                 self.heap[index], self.heap[childToSwap] = self.heap[childToSwap], self.heap[index]
46 |             else:
47 |                 break
48 | 
49 |             index = childToSwap
50 | 
51 |     def heapSort(self):
52 |         # repeatedly move the current max to the end of the active region
53 |         for i in range(0, self.currentPosition + 1):
54 |             temp = self.heap[0]
55 |             print("%d" % temp)
56 |             self.heap[0] = self.heap[self.currentPosition - i]
57 |             self.heap[self.currentPosition - i] = temp
58 |             self.fixDown(0, self.currentPosition - i - 1)
59 | 
60 |     def getMax(self):
61 |         result = self.heap[0]
62 |         # move the last item to the root, shrink the heap, restore order
63 |         self.heap[0] = self.heap[self.currentPosition]
64 |         self.heap[self.currentPosition] = 0
65 |         self.currentPosition -= 1
66 |         self.fixDown(0, -1)
67 |         return result
68 | 
69 |     def isFull(self):
70 |         return self.currentPosition == Heap.HEAP_SIZE - 1
71 | 
72 | 
73 | some_heap = Heap()
74 | some_heap.insert(12)
75 | some_heap.insert(-3)
76 | some_heap.insert(21)
77 | some_heap.insert(7)
78 | some_heap.insert(4)
79 | some_heap.heapSort()  # 21 12 7 4 -3
--------------------------------------------------------------------------------
/hundred_without_int.py:
--------------------------------------------------------------------------------
1 | # Print numbers 1 to 100 without using any numbers or integers
2 | 
3 | # APPROACH
4 | # Use Boolean values
5 | 
6 | ONE = str(int(True))
7 | ZERO = str(int(False))
8 | HUNDRED = int(ONE + ZERO + ZERO)
9 | 
10 | for i in range(int(ONE), HUNDRED+1):
11 |     print(i, end=', ')
12 | 
13 | # OUTPUT (Actual prints in the same line, line breaks given here for code clarity):
14 | 
15 | # 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
16 | # 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
17 | # 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
18 | # 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
19 | # 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
20 | # 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
21 | # 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
22 | # 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
23 | # 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
24 | # 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
25 | 
26 | 
27 | # ALTERNATE APPROACH
28 | # ORD (or "ordinal") gives the Unicode code point of a character;
29 | # for ASCII (American Standard Code for Information Interchange)
30 | # characters the value is in the range 0-255, i.e. one byte.
31 | 
32 | ONE = str(ord('b') - ord('a'))
33 | ZERO = str(ord('a') - ord('a'))
34 | HUNDRED = int(ONE + ZERO + ZERO)
35 | 
36 | for i in range(int(ONE), HUNDRED+1):
37 |     print(i, end=', ')
--------------------------------------------------------------------------------
/interger_to_roman_num.py:
--------------------------------------------------------------------------------
1 | # Write a program to convert an integer to its Roman representation
2 | def convertIntegerToRomanNum(input_num):
3 | 
4 |     number_list = [
5 |         1000, 900, 500, 400,
6 |         100, 90, 50, 40,
7 |         10, 9, 5, 4,
8 |         1
9 |     ]
10 | 
11 |     symbol_list = [
12 |         'M', 'CM', 'D', 'CD',
13 |         'C', 'XC', 'L', 'XL',
14 |         'X', 'IX', 'V', 'IV',
15 |         'I'
16 |     ]
17 | 
18 |     result_str = ''
19 |     i = 0
20 | 
21 |     while input_num > 0:
22 |         # if input_num//number_list[i] > 0, the symbol is appended that
23 |         # many times; otherwise we move on to the next denomination
24 |         for _ in range(input_num//number_list[i]):
25 |             result_str = result_str + symbol_list[i]
26 |             input_num = input_num - number_list[i]
27 |         i = i + 1
28 |     return result_str
29 | 
30 | roman = convertIntegerToRomanNum(289)
31 | print(roman)  # CCLXXXIX
32 | 
33 | # ===============
34 | # Explanation:
35 | # ===============
36 | # 289: nothing divides out for 1000, 900, 500 and 400 (i = 0..3)
37 | # i=4 (100): 289//100 --> 2, result_str='CC', input_num=89
38 | # i=5 (90):  89//90   --> 0
39 | # i=6 (50):  89//50   --> 1, result_str='CCL', input_num=39
40 | # i=7 (40):  39//40   --> 0
41 | # i=8 (10):  39//10   --> 3, result_str='CCLXXX', input_num=9
42 | # i=9 (9):   9//9     --> 1, result_str='CCLXXXIX', input_num=0
43 | # EXIT
--------------------------------------------------------------------------------
/intersection_arrays.py:
--------------------------------------------------------------------------------
1 | # Problem: Given two sorted array of sizes m and n
2 | # in which all elements are distinct. Find the
3 | # common elements between them
4 | # Constraints: in O(m+n) complexity.
5 | 
6 | def inr(z):
7 |     return z + 1
8 | 
9 | def intersectionArrays(x, y, m, n):
10 |     i, j = 0, 0
11 |     intersect_arr = []
12 | 
13 |     while i < m and j < n:
14 |         # print(i, j)
15 |         if x[i] == y[j]:
16 |             intersect_arr.append(x[i])
17 |             i, j = inr(i), inr(j)
18 |         elif x[i] < y[j]:
19 |             i = inr(i)
20 |         else:
21 |             j = inr(j)
22 |     print(intersect_arr)
23 |     return
24 | 
25 | list_a = [1, 2, 3, 4, 5]
26 | list_b = [2, 3, 5, 6]
27 | intersectionArrays(list_a, list_b, len(list_a), len(list_b))  # [2, 3, 5]
--------------------------------------------------------------------------------
/isMatrixSymmetric.py:
--------------------------------------------------------------------------------
1 | def isMatrixSymmetric(mat, size):
2 |     for i in range(size):
3 |         for j in range(size):
4 |             # return immediately on the first mismatch; a bare
5 |             # `break` here would only exit the inner loop
6 |             if mat[i][j] != mat[j][i]:
7 |                 return False
8 | 
9 |     return True
10 | 
11 | ipMat = [[1,2,3],[2,4,5],[3,5,8]]
12 | result = isMatrixSymmetric(ipMat, len(ipMat))
13 | print(result)  # True
--------------------------------------------------------------------------------
/is_anagram.py:
--------------------------------------------------------------------------------
1 | # Check if two strings are anagrams of each other
2 | 
3 | 
4 | def anagramise(word):
5 |     d = dict()
6 | 
7 |     for char in word:
8 |         if char not in d.keys():
9 |             d[char] = 1
10 |         else:
11 |             d[char] += 1
12 | 
13 |     return d
14 | 
15 | 
16 | def is_anagram(str1, str2):
17 |     return anagramise(str1) == anagramise(str2)
18 | 
19 | 
20 | result = is_anagram("hello", "billion")
21 | print(result)  # False
--------------------------------------------------------------------------------
/is_anagram_using_collections.py:
--------------------------------------------------------------------------------
1 | # Check if two strings are anagrams of each other
2 | 
3 | 
4 | from collections import Counter
5 | 
6 | def is_anagram(str1, str2):
7 |     return Counter(str1) == Counter(str2)
8 | 
9 | 
10 | result = is_anagram("hello", "billion")
11 | print(result)  # False
12 | 
13 | result = is_anagram("million", "million")
14 | print(result)  # True
--------------------------------------------------------------------------------
/is_num_palindrome.py:
--------------------------------------------------------------------------------
1 | # Write a program to check if a
2 | # number is a palindrome or not
3 | 
4 | def is_num_palindrome(num):
5 |     temp = str(num)
6 |     i = 0
7 |     j = len(temp) - 1
8 | 
9 |     while j > i:
10 |         # print(temp[i], temp[j])
11 |         if temp[i] == temp[j]:
12 |             i = i + 1
13 |             j = j - 1
14 |         else:
15 |             return False
16 |     return True
17 | 
18 | n = 3456543
19 | res = is_num_palindrome(n)
20 | print(res)  # True
--------------------------------------------------------------------------------
/is_numeric.py:
--------------------------------------------------------------------------------
1 | # Given a string, return True if it
2 | # is a numeric data type, False otherwise
3 | 
4 | 
5 | def is_numeric(input_str):
6 | 
7 |     data_types = [
8 |         int,
9 |         float,
10 |         complex,
11 |         lambda T: int(T, 2),   # binary
12 |         lambda T: int(T, 8),   # octal
13 |         lambda T: int(T, 16),  # hex
14 |     ]
15 | 
16 |     for dtype in data_types:
17 |         try:
18 |             dtype(input_str)
19 |             return True
20 |         except ValueError:
21 |             pass
22 |     return False
23 | 
24 | 
25 | tests = [
26 |     "0",
27 |     "0.",
28 |     "00",
29 |     "123",
30 |     "0123",
31 |     "+123",
32 |     "-123",
33 |     "-123.",
34 |     "-123e-4",
35 |     "-.8E-04",
36 |     "0.123",
37 |     "(5)",
38 |     "-123+4.5j",
39 |     "0b0101",
40 |     " +0B101 ",
41 |     "0o123",
42 |     "-0xABC",
43 |     "0x1a1",
44 |     "12.5%",
45 |     "1/2",
46 |     "½",
47 |     "3¼",
48 |     "π",
49 |     "Ⅻ",
50 |     "1,000,000",
51 |     "1 000",
52 |     "- 001.20e+02",
53 |     "NaN",
54 |     "inf",
55 |     "-Infinity",
56 | ]
57 | 
58 | for s in tests:
59 |     print(s, "---", is_numeric(s))
60 | 
61 | """
62 | OUTPUT:
63 | 
64 | 0 --- True
65 | 0. --- True
66 | 00 --- True
67 | 123 --- True
68 | 0123 --- True
69 | +123 --- True
70 | -123 --- True
71 | -123. --- True
72 | -123e-4 --- True
73 | -.8E-04 --- True
74 | 0.123 --- True
75 | (5) --- True
76 | -123+4.5j --- True
77 | 0b0101 --- True
78 |  +0B101  --- True
79 | 0o123 --- True
80 | -0xABC --- True
81 | 0x1a1 --- True
82 | 12.5% --- False
83 | 1/2 --- False
84 | ½ --- False
85 | 3¼ --- False
86 | π --- False
87 | Ⅻ --- False
88 | 1,000,000 --- False
89 | 1 000 --- False
90 | - 001.20e+02 --- False
91 | NaN --- True
92 | inf --- True
93 | -Infinity --- True
94 | """
--------------------------------------------------------------------------------
/josephus.py:
--------------------------------------------------------------------------------
1 | # Problem: N soldiers are standing in a circle and
2 | # first person has sword and he kills the 2nd person
3 | # and gives the sword to the third person and so on
4 | # till 99th person kills the 100th person gives the
5 | # sword back to the first person, this goes on till
6 | # only one person survives. Print the survivor.
7 | 
8 | 
9 | def josephus(people, step=2):
10 |     if step <= 1:
11 |         print("Enter step value, greater than 1")
12 |     else:
13 |         step -= 1  # translated to zero-based indexing
14 |         kill = step  # kill will hold the index of current person to die
15 |         while len(people) > 1:
16 |             print(people.pop(kill))  # pop method removes the element from the list
17 |             kill = (kill + step) % len(people)
18 |         print(people[0], "is safe")
19 | 
20 | 
21 | num = int(input("Enter the number of soldiers: "))
22 | soldiers = [i for i in range(1, num + 1)]  # generates a list of 1..num
23 | josephus(soldiers)
--------------------------------------------------------------------------------
/josephus_improved.py:
--------------------------------------------------------------------------------
1 | # The effective time complexity of the
2 | # improved version is O(logN). For the
3 | # problem statement, refer `josephus.py`
4 | 
5 | 
6 | def josephus_v2(people, step=2):
7 |     if step <= 1:
8 |         print("Enter step value, greater than 1")
9 |     else:
10 |         # len() method has O(1) time
11 |         N = len(people)  # caching the size of array
12 |         p = 1
13 |         # the loop runs for O(floor(logN)) time
14 |         while p * 2 < N:
15 |             p = p * 2
16 |         # survivor = 2L + 1, where L = N - p and p is the largest
17 |         # power of 2 with 2p < N; when N is itself a power of two,
18 |         # p is N/2, so L >= p and the survivor is soldier 1
19 |         if N - p >= p:
20 |             print(1)
21 |         else:
22 |             print((2 * (N - p)) + 1)
23 | 
24 | 
25 | num = int(input("Enter the number of soldiers: "))
26 | soldiers = [i for i in range(1, num + 1)]  # generates a list of 1..num
27 | josephus_v2(soldiers)
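28 | 
29 | # Sanity check (illustrative sketch): the closed form, stated here with
30 | # p as the largest power of 2 <= n, can be verified against a
31 | # brute-force circle simulation for small n.
32 | def survivor_formula(n):
33 |     p = 1
34 |     while p * 2 <= n:
35 |         p *= 2
36 |     return 2 * (n - p) + 1
37 | 
38 | def survivor_bruteforce(n):
39 |     people = list(range(1, n + 1))
40 |     kill = 1
41 |     while len(people) > 1:
42 |         people.pop(kill)
43 |         kill = (kill + 1) % len(people)
44 |     return people[0]
45 | 
46 | # for n in range(1, 65):
47 | #     assert survivor_formula(n) == survivor_bruteforce(n)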
--------------------------------------------------------------------------------
/josephus_improved_v3.py:
--------------------------------------------------------------------------------
1 | # The effective time complexity of this improved version is O(1).
2 | # For the problem statement, refer `josephus.py`
3 | 
4 | 
5 | def josephus_v3(soldiers):
6 |     # Convert to binary.
7 |     binary = bin(soldiers)
8 |     # Get the first digit and put it as last: rotating the leading
9 |     # 1-bit of N to the end computes 2L + 1 directly.
10 |     shift = "0b" + binary[3::] + binary[2:3:]
11 |     # Convert to decimal.
12 |     return int(shift, 2)
13 | 
14 | 
15 | winning = josephus_v3(soldiers=41)
16 | print(winning)  # Winning Soldier: 19
17 | 
18 | winning = josephus_v3(soldiers=100)
19 | print(winning)  # Winning Soldier: 73
20 | 
21 | winning = josephus_v3(soldiers=1000)
22 | print(winning)  # Winning Soldier: 977
23 | 
24 | # Testing:
25 | test_josephus = {
26 |     1: 1,
27 |     2: 1,
28 |     3: 3,
29 |     4: 1,
30 |     5: 3,
31 |     6: 5,
32 |     7: 7,
33 |     8: 1,
34 |     9: 3,
35 |     10: 5,
36 |     11: 7,
37 |     12: 9,
38 |     13: 11,
39 |     16: 1,
40 |     41: 19,
41 | }
42 | for soldiers, expected_winner in test_josephus.items():
43 |     assert josephus_v3(soldiers=soldiers) == expected_winner
--------------------------------------------------------------------------------
/karatsuba.py:
--------------------------------------------------------------------------------
1 | import random
2 | from math import ceil
3 | from math import log10
4 | 
5 | 
6 | def get_digits(n):
7 |     if n > 0:
8 |         digits = int(log10(n)) + 1
9 |     elif n == 0:
10 |         digits = 1
11 |     else:
12 |         digits = int(log10(-n)) + 2
13 |     return digits
14 | 
15 | 
16 | def karatsuba(x, y):
17 |     # the base case for recursion
18 |     if x < 10 and y < 10:
19 |         return x * y
20 | 
21 |     # n is the number of digits in the highest input number
22 |     n = max(get_digits(x), get_digits(y))
23 | 
24 |     n_2 = int(ceil(n / 2.0))
25 |     n = n if n % 2 == 0 else n + 1
26 | 
27 |     # split the input numbers
28 |     a, b = divmod(x, 10 ** n_2)
29 |     c, d = divmod(y, 10 ** n_2)
30 | 
31 |     # applying the recursive steps
32 |     ac = karatsuba(a, c)
33 |     bd = karatsuba(b, d)
34 |     ad_bc = karatsuba((a + b), (c + d)) - ac - bd
35 | 
36 |     # performs the multiplication
37 |     z2 = (10 ** n) * ac
38 |     z1 = (10 ** n_2) * ad_bc
39 |     z0 = bd
40 |     return z2 + z1 + z0
41 | 
42 | 
43 | def test():
44 |     for i in range(1000):
45 |         x = random.randint(1, 10 ** 5)
46 |         y = random.randint(1, 10 ** 5)
47 |         expected = x * y
48 |         result = karatsuba(x, y)
49 |         if result != expected:
50 |             return print("failed")
51 |     return print("ok")
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     test()
56 | 
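57 | # Worked example (for intuition): x = 1234, y = 5678, n_2 = 2
58 | #   a, b = 12, 34    c, d = 56, 78
59 | #   ac = 672, bd = 2652
60 | #   ad_bc = (12 + 34) * (56 + 78) - ac - bd = 6164 - 672 - 2652 = 2840
61 | #   result = 10**4 * 672 + 10**2 * 2840 + 2652 = 7006652 = 1234 * 5678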
--------------------------------------------------------------------------------
/level_order_tree.py:
--------------------------------------------------------------------------------
1 | # This is a level order traversal of
2 | # a binary tree. It is also known as
3 | # breadth first traversal.
4 | 
5 | 
6 | class Node:
7 |     def __init__(self, data):
8 |         self.data = data
9 |         self.left = None
10 |         self.right = None
11 | 
12 | 
13 | def printLevelOrder(root):
14 |     height = getHeight(root)
15 |     if root is None:
16 |         return
17 |     else:
18 |         # iteratively print all the levels
19 |         for i in range(1, height + 1):
20 |             printGivenLevel(root, i)
21 | 
22 | 
23 | def printGivenLevel(root, level):
24 |     if root is None:
25 |         return
26 |     if level == 1:
27 |         print("%d" % root.data)
28 |     elif level > 1:
29 |         # recursively print the given level
30 |         printGivenLevel(root.left, level - 1)
31 |         printGivenLevel(root.right, level - 1)
32 | 
33 | 
34 | def getHeight(root):
35 |     if root is None:
36 |         return 0
37 |     else:
38 |         lheight = getHeight(root.left)
39 |         rheight = getHeight(root.right)
40 | 
41 |         if lheight > rheight:
42 |             return lheight + 1
43 |         else:
44 |             return rheight + 1
45 | 
46 | 
47 | root = Node(1)
48 | root.left = Node(2)
49 | root.right = Node(3)
50 | root.left.left = Node(4)
51 | root.left.right = Node(5)
52 | 
53 | print("Level order traversal of binary tree is -")
54 | printLevelOrder(root)
--------------------------------------------------------------------------------
/linked_list_data_structure.py:
--------------------------------------------------------------------------------
1 | class Node:
2 |     def __init__(self, data=None):
3 |         self.data = data
4 |         self.next_node = None
5 | 
6 | 
7 | class LinkedList:
8 |     def __init__(self, head=None):
9 |         self.head = head
10 | 
11 |     def isEmpty(self):
12 |         return self.head == None
13 | 
14 |     def insert(self, data):
15 |         # create a temp node
16 |         temp = Node(data=data)
17 |         # point new node to head
18 |         temp.next_node = self.head
19 |         # set the head as new node
20 |         self.head = temp
21 | 
22 |     def insert_after(self, prev, data):
23 |         if prev is None:
24 |             raise ValueError("Given node is not found...")
25 | 
26 |         # create a temp node
27 |         temp = Node(data=data)
28 |         # set next node of temp to the next node of previous
29 |         temp.next_node = prev.next_node
30 |         # set next node of previous to point temp
31 |         prev.next_node = temp
32 | 
33 |     def size(self):
34 |         # start with the head
35 |         current = self.head
36 |         count = 0
37 | 
38 |         # loop unless current is not None
39 |         while current:
40 |             count += 1
41 |             current = current.next_node
42 |         return count
43 | 
44 |     def search(self, data):
45 |         # start with the head
46 |         current = self.head
47 |         found = False
48 | 
49 |         # loop unless current is not None
50 |         while current and not found:
51 |             # if found, change flag and return data
52 |             if current.data == data:
53 |                 found = True
54 |             else:
55 |                 # change current to next node
56 |                 current = current.next_node
57 |         if current is None:
58 |             # raise Exception if not found
59 |             raise ValueError("Data is not in the list")
60 |         return current
61 | 
62 |     def delete(self, data):
63 |         # start with the head
64 |         current = self.head
65 |         previous = None
66 |         found = False
67 | 
68 |         # loop unless current is not None
69 |         while current and not found:
70 |             # if found, change flag
71 |             if current.data == data:
72 |                 found = True
73 |             else:
74 |                 previous = current
75 |                 current = current.next_node
76 | 
77 |         if current is None:
78 |             # raise Exception if not found
79 |             raise ValueError("Data is not in the list")
80 |         if previous is None:
81 |             self.head = current.next_node
82 |         else:
83 |             previous.next_node = current.next_node
84 | 
--------------------------------------------------------------------------------
/loop_in_linkedlist.py:
--------------------------------------------------------------------------------
1 | from linked_list_data_structure import LinkedList
2 | from find_m_to_last_llist import findMToLast
3 | 
4 | 
5 | def hasLoop(l_list):
6 |     # Floyd's cycle detection: a slow pointer advances one node
7 |     # at a time, a fast pointer two; they can only meet again
8 |     # if the list contains a loop
9 |     slow = l_list.head
10 |     fast = l_list.head
11 | 
12 |     while fast is not None and fast.next_node is not None:
13 |         slow = slow.next_node
14 |         fast = fast.next_node.next_node
15 |         if slow == fast:
16 |             return True
17 | 
18 |     return False
19 | 
20 | 
21 | linked_list = LinkedList()
22 | # Returns the third element from last
23 | print(findMToLast(linked_list))
--------------------------------------------------------------------------------
/lowest_common_ancestor.py:
--------------------------------------------------------------------------------
1 | def findLowestCommonAncestor(root, value1, value2):
2 |     while root is not None:
3 |         value = root.value
4 |         if value > value1 and value > value2:
5 |             root = root.left
6 |         elif value < value1 and value < value2:
7 |             root = root.right
8 |         else:
9 |             return root
--------------------------------------------------------------------------------
/majority_element.py:
--------------------------------------------------------------------------------
1 | # Problem: A majority element in an array A[] of
2 | # size n is an element that appears more than n/2
3 | # times. Find the majority element in the given array.
4 | 
5 | # Returns the elements whose frequency is more than n/2
6 | def findMajorityElement(elements, N, found=False):
7 |     keys = [int(i) for i in elements.keys()]
8 |     for i in keys:
9 |         if elements[i] > N // 2:
10 |             found = True
11 |             print(i)
12 |     if not found:
13 |         print("Majority element not found")
14 | 
15 | 
16 | # Creates a hash of frequency of numbers
17 | def mapFrequency(arr):
18 |     FREQUENCY = {}
19 |     for i in arr:
20 |         if i in FREQUENCY.keys():
21 |             FREQUENCY[i] += 1
22 |         else:
23 |             # start the count at 1 so true occurrence counts
24 |             # are compared against n/2 above
25 |             FREQUENCY[i] = 1
26 |     return findMajorityElement(FREQUENCY, len(arr))
27 | 
28 | 
29 | arr = [1, 2, 4, 4, 4, 4, 4]
30 | mapFrequency(arr)  # 4
--------------------------------------------------------------------------------
/max_in_array.py:
--------------------------------------------------------------------------------
1 | def maxIndex(arr):
2 |     max_index = 0
3 |     for i in range(1, len(arr)):
4 |         if arr[max_index] < arr[i]:
5 |             max_index = i
6 |     return max_index
7 | 
8 | arr = [4,5,6,7,8,1,2,11,12,13,3,9,10]
9 | res = maxIndex(arr)
10 | print("maximum element is", arr[res], "at index:", res)
--------------------------------------------------------------------------------
/maximum_subarray_sum.py:
--------------------------------------------------------------------------------
1 | # Problem: Given a list of positive and negative numbers, find the maximum subarray sum.
2 | # Constraint: Solve it in O(n)
3 | 
4 | # Solution (Kadane's algorithm): keep two running values
5 | # a. current max -- the best sum of a subarray ending at the current element
6 | # b. overall max -- the best current max seen so far
7 | # Traverse the array once: the current max is either the element itself
8 | # or the element plus the previous current max, whichever is larger;
9 | # the overall max is then the sum of the contiguous subarray with the
10 | # largest sum
11 | 
12 | def maxSubArraySum(arr):
13 |     max_so_far = arr[0]
14 |     current_max = arr[0]
15 | 
16 |     for i in range(1, len(arr)):
17 |         current_max = max(arr[i], current_max + arr[i])
18 |         max_so_far = max(max_so_far, current_max)
19 | 
20 |     return max_so_far
21 | 
22 | sampleArr = [-2, -3, 4, -1, -2, 1, 5, -3]
23 | 
24 | solution = maxSubArraySum(sampleArr)
25 | print(solution)  # 7
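26 | 
27 | # Worked trace on sampleArr (current_max / max_so_far after each step):
28 | #   -2: -2/-2   -3: -3/-2    4: 4/4    -1: 3/4    -2: 1/4
29 | #    1:  2/4     5:  7/7    -3: 4/7
30 | # answer: 7, from the subarray [4, -1, -2, 1, 5]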
--------------------------------------------------------------------------------
/merge_sort.py:
--------------------------------------------------------------------------------
1 | def merge_sort(array):
2 |     # derive the mid-point
3 |     if len(array) > 1:
4 |         mid = len(array) // 2
5 | 
6 |         # create the temp sub-arrays
7 |         LEFT = array[:mid]
8 |         RIGHT = array[mid:]
9 | 
10 |         # sort the first and second halves
11 |         merge_sort(LEFT)
12 |         merge_sort(RIGHT)
13 | 
14 |         # begin adding elements in sorted order, writing
15 |         # back into the original array from index 0
16 |         i, j, k = 0, 0, 0
17 | 
18 |         while i < len(LEFT) and j < len(RIGHT):
19 |             if LEFT[i] < RIGHT[j]:
20 |                 array[k] = LEFT[i]
21 |                 i += 1
22 |             else:
23 |                 array[k] = RIGHT[j]
24 |                 j += 1
25 |             k += 1
26 | 
27 |         # copy the remaining data
28 |         while i < len(LEFT):
29 |             array[k] = LEFT[i]
30 |             i += 1
31 |             k += 1
32 | 
33 |         while j < len(RIGHT):
34 |             array[k] = RIGHT[j]
35 |             j += 1
36 |             k += 1
37 | 
38 | 
39 | arr = [6, 5, 3, 1, 8, 7, 2, 4]
40 | merge_sort(arr)
41 | print(arr)  # [1, 2, 3, 4, 5, 6, 7, 8]
--------------------------------------------------------------------------------
/min_max_array_oneLoop.py:
--------------------------------------------------------------------------------
1 | def maxMinIndex(arr):
2 | 
3 |     max_index = 0
4 |     min_index = 0
5 | 
6 |     # single pass, tracking both extremes
7 |     for i in range(1, len(arr)):
8 |         if arr[max_index] < arr[i]:
9 |             max_index = i
10 |         if arr[min_index] > arr[i]:
11 |             min_index = i
12 | 
13 |     return max_index, min_index
14 | 
15 | arr = [4,5,6,7,8,1,2,11,12,13,3,9,10]
16 | res = maxMinIndex(arr)
17 | print("maximum element is", arr[res[0]], "at index:", res[0])
18 | print("minimum element is", arr[res[1]], "at index:", res[1])
--------------------------------------------------------------------------------
/move_zeros_to_end.py:
--------------------------------------------------------------------------------
1 | # Given an array of integers we need to move
2 | # all the zeroes to the end and maintain the
3 | # order of rest of the elements. Needless to
4 | # say it should be an in-place solution
5 | 
6 | def move_zero_to_end(arr):
7 |     count = 0
8 | 
9 |     # compact all non-zero elements to the front
10 |     for i in arr:
11 |         if i != 0:
12 |             arr[count] = i
13 |             count = count + 1
14 | 
15 |     # fill the remaining slots with zeros
16 |     for i in range(count, len(arr)):
17 |         arr[i] = 0
18 | 
19 |     return arr
20 | 
21 | 
22 | array = [1, 9, 8, 4, 0, 0, 2, 7, 0, 6, 0, 9]
23 | res = move_zero_to_end(array)
24 | print(res)  # [1, 9, 8, 4, 2, 7, 6, 9, 0, 0, 0, 0]
--------------------------------------------------------------------------------
/no_sibling_tree.py:
--------------------------------------------------------------------------------
1 | # Problem: Print the nodes of a binary tree
2 | # which do not have a sibling
3 | 
4 | 
5 | class Node:
6 |     def __init__(self, data):
7 |         self.data = data
8 |         self.left = None
9 |         self.right = None
10 | 
11 | 
12 | def printSingleNode(root, hasSibling):
13 |     # hasSibling will check if root has both children
14 |     if root is None:
15 |         return
16 |     else:
17 |         # if root has one child, print that child data
18 |         if not hasSibling:
19 |             print("%d" % root.data)
20 | 
21 |         printSingleNode(root.left, root.right is not None)
22 |         printSingleNode(root.right, root.left is not None)
23 | 
24 | 
25 | root = Node(1)
26 | root.left = Node(2)
27 | root.right = Node(3)
28 | root.left.left = Node(4)
29 | root.right.left = Node(5)
30 | root.right.left.left = Node(6)
31 | 
32 | print("Nodes without a sibling are -")
33 | printSingleNode(root, True)  # 4 5 6
--------------------------------------------------------------------------------
/oddAscEvenDesc.py:
--------------------------------------------------------------------------------
1 | # Let all odd numbers come before even numbers,
2 | # and sort the odd numbers in ascending order and
3 | # even numbers in descending order.
4 | # For example, the string '1982376455' becomes '1355798642'
5 | 
6 | def oddAscEvenDesc(inputStr):
7 |     oddSubstr = ''
8 |     evenSubstr = ''
9 | 
10 |     for char in inputStr:
11 |         if int(char) % 2 == 0:
12 |             evenSubstr += char
13 |         else:
14 |             oddSubstr += char
15 | 
16 |     temp = sorted(oddSubstr) + sorted(evenSubstr, reverse=True)
17 | 
18 |     return "".join(temp)
19 | 
20 | x = "978231456"
21 | y = oddAscEvenDesc(x)
22 | print(y)  # 135798642
--------------------------------------------------------------------------------
/pascal_triangle.py:
--------------------------------------------------------------------------------
1 | # Recursive method to create the series
2 | def computePascal(col, row):
3 |     # There are three things to compute
4 |     # 1. Left edge: col is 0
5 |     # 2. Right edge: col is same as row
6 |     if col == row or col == 0:
7 |         return 1
8 |     # 3. any other cell: col-1 + col of the previous row
9 |     else:
10 |         return computePascal(col - 1, row - 1) + computePascal(col, row - 1)
11 | 
12 | 
13 | # Method to create the triangle for `N` rows
14 | def printTriangle(num):
15 |     for r in range(num):
16 |         # upon observation, we can deduce the relation
17 |         # num_cols = num_rows + 1
18 |         for c in range(r + 1):
19 |             print(str(computePascal(c, r)), end=" ")
20 |         print("\n")
21 | 
22 | 
23 | printTriangle(10)
24 | """
25 | Output:
26 | 1
27 | 
28 | 1 1
29 | 
30 | 1 2 1
31 | 
32 | 1 3 3 1
33 | 
34 | 1 4 6 4 1
35 | 
36 | 1 5 10 10 5 1
37 | 
38 | 1 6 15 20 15 6 1
39 | 
40 | 1 7 21 35 35 21 7 1
41 | 
42 | 1 8 28 56 70 56 28 8 1
43 | 
44 | 1 9 36 84 126 126 84 36 9 1
45 | """
--------------------------------------------------------------------------------
/pascals_triangle_improved.py:
--------------------------------------------------------------------------------
1 | # using factorials, each entry becomes a direct binomial
2 | # coefficient, nCr = row! / (col! * (row - col)!), so computing
3 | # one entry drops from the O(2^N) double recursion to O(N)
4 | 
5 | 
6 | def factorial(n):
7 |     if n < 2:
8 |         return 1
9 |     else:
10 |         return n * factorial(n - 1)
11 | 
12 | 
13 | def computeCoefficient(col, row):
14 |     return factorial(row) // (factorial(col) * factorial(row - col))
15 | 
16 | 
17 | # Recursive method to create the series
18 | def computePascal(col, row):
19 |     if col == row or col == 0:
20 |         return 1
21 |     else:
22 |         return computeCoefficient(col, row)
23 | 
24 | 
25 | # Method to create the triangle for `N` rows
26 | def printTriangle(num):
27 |     for r in range(num):
28 |         for c in range(r + 1):
29 |             print(str(computePascal(c, r)), end=" ")
30 |         print("\n")
31 | 
32 | 
33 | printTriangle(10)
34 | """
35 | Output:
36 | 1
37 | 
38 | 1 1
39 | 
40 | 1 2 1
41 | 
42 | 1 3 3 1
43 | 
44 | 1 4 6 4 1
45 | 
46 | 1 5 10 10 5 1
47 | 
48 | 1 6 15 20 15 6 1
49 | 
50 | 1 7 21 35 35 21 7 1
51 | 
52 | 1 8 28 56 70 56 28 8 1
53 | 
54 | 1 9 36 84 126 126 84 36 9 1
55 | """
--------------------------------------------------------------------------------
/permutations.py:
--------------------------------------------------------------------------------
1 | def permutations(word):
2 |     if len(word) == 1:
3 |         return [word]
4 |     else:
5 |         result = []
6 |         # insert the first character into every position of
7 |         # every permutation of the remaining characters
8 |         for p in permutations(word[1:]):
9 |             for i in range(len(word)):
10 |                 current_p = p[:i] + word[0:1] + p[i:]
11 |                 result.append(current_p)
12 | 
13 |         return result
14 | 
15 | 
16 | given_input = "bc"
17 | print(permutations(given_input))  # ['bc', 'cb']
--------------------------------------------------------------------------------
/permute_strings.py:
--------------------------------------------------------------------------------
1 | def permute(s, result):
2 |     if len(s) == 0:
3 |         print(result, end=" ")
4 |         return
5 | 
6 |     for i in range(len(s)):
7 |         char = s[i]
8 |         left_str = s[0: i]
9 |         right_str = s[i+1: ]
10 | 
11 |         other_str = left_str + right_str
12 |         permute(other_str, result + char)
13 | 
14 | permute("naruto", "")
15 | 
--------------------------------------------------------------------------------
/preorder_iterative_bst.py:
--------------------------------------------------------------------------------
1 | def preOrderTraversal(root):
2 |     # LIFO stack: push the right child first so the
3 |     # left subtree is processed before the right one
4 |     stack = [root]
5 |     while len(stack) > 0:
6 |         current = stack.pop()
7 |         print(current.value)
8 | 
9 |         right = current.right
10 |         if right is not None:
11 |             stack.append(right)
12 | 
13 |         left = current.left
14 |         if left is not None:
15 |             stack.append(left)
16 | 
--------------------------------------------------------------------------------
/priority_queue_simple.py:
--------------------------------------------------------------------------------
1 | # Priority Queue is an extension of the queue with following properties.
2 | # 1) An element with high priority is dequeued before an element with low priority.
3 | # 2) If two elements have the same priority, they are served according to their order in the queue
4 | # The delete operation has time complexity of O(n)
5 | 
6 | import sys
7 | 
8 | class PriorityQueue(object):
9 |     def __init__(self):
10 |         self.queue = []
11 | 
12 |     def __str__(self):
13 |         return ' '.join([str(i) for i in self.queue])
14 | 
15 |     # for checking if the queue is empty
16 |     def isEmpty(self):
17 |         return len(self.queue) == 0
18 | 
19 |     # for inserting an element in the queue
20 |     def insert(self, data):
21 |         self.queue.append(data)
22 | 
23 |     # for popping an element based on Priority
24 |     def delete(self):
25 |         try:
26 |             max_index = 0
27 | 
28 |             for i in range(len(self.queue)):
29 |                 if self.queue[max_index] < self.queue[i]:
30 |                     max_index = i
31 | 
32 |             item = self.queue[max_index]
33 |             del self.queue[max_index]
34 | 
35 |             return item
36 | 
37 |         except IndexError:
38 |             print("Queue is empty")
39 |             sys.exit()
40 | 
41 | myQueue = PriorityQueue()
42 | myQueue.insert(12)
43 | myQueue.insert(1)
44 | myQueue.insert(14)
45 | myQueue.insert(7)
46 | 
47 | print(myQueue)  # 12 1 14 7
48 | 
49 | while not myQueue.isEmpty():
50 |     print(myQueue.delete())  # 14, 12, 7, 1
51 | 
--------------------------------------------------------------------------------
/processStringToDict.py:
--------------------------------------------------------------------------------
1 | # Process the string "k:1 |k1:2|k2:3|k3:4" into a dictionary {k:1,k1:2,...}
2 | 
3 | StringToProcess = "k:1 |k1:2|k2:3|k3:4"
4 | 
5 | d2 = dict()
6 | keyvalue_list = StringToProcess.split('|')  # ['k:1 ', 'k1:2', 'k2:3', 'k3:4']
7 | 
8 | for keyval in keyvalue_list:
9 |     k, v = keyval.split(':')  # (k,1), (k1,2), (k2,3), (k3,4)
10 |     d2[k] = v
11 | 
12 | 
13 | print(d2)  # {'k': '1 ', 'k1': '2', 'k2': '3', 'k3': '4'}
--------------------------------------------------------------------------------
/product_puzzle.py:
--------------------------------------------------------------------------------
1 | # Problem: Given an array arr[] of n integers,
2 | # construct a Product Array prod[] (of same size)
3 | # such that prod[i] is equal to the product of all
4 | # the elements of arr[] except arr[i].
5 | 
6 | # Constraints: Solve it without division operator and in O(n)
7 | 
8 | 
9 | def computeProductArray(array, size):
10 |     # initialize three arrays of the same size as given array
11 |     # Left: will hold the product of all the elements to the left
12 |     # Right: will hold the product of all the elements to the right
13 |     # Product: contains the product value for current element
14 | 
15 |     prod = [1] * size
16 |     left = [1] * size
17 |     right = [1] * size
18 | 
19 |     for i in range(1, size):
20 |         left[i] = array[i - 1] * left[i - 1]
21 |     # decreasing loop in Python {start, end, step(-ve)}
22 |     # equivalent to (j=size-2; j>=0; j--)
23 |     for j in range(size - 2, -1, -1):
24 |         right[j] = array[j + 1] * right[j + 1]
25 |     for k in range(0, size):
26 |         prod[k] = left[k] * right[k]
27 |     print(prod)
28 | 
29 | 
30 | arr = [10, 3, 5, 6, 2]
31 | computeProductArray(arr, len(arr))
32 | # Result: [180, 600, 360, 300, 900]
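33 | 
34 | # Worked trace for arr = [10, 3, 5, 6, 2]:
35 | #   left  = [1, 10, 30, 150, 900]   (products of everything to the left)
36 | #   right = [180, 60, 12, 2, 1]     (products of everything to the right)
37 | #   prod  = left * right elementwise = [180, 600, 360, 300, 900]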
--------------------------------------------------------------------------------
/queue_data_structure.py:
--------------------------------------------------------------------------------
1 | class Queue:
2 |     def __init__(self):
3 |         self.queue = []
4 | 
5 |     def enqueue(self, value):
6 |         self.queue.insert(0, value)
7 | 
8 |     def dequeue(self):
9 |         # return the removed element to the caller
10 |         return self.queue.pop()
11 | 
12 |     def isEmpty(self):
13 |         return self.size() == 0
14 | 
15 |     def size(self):
16 |         return len(self.queue)
--------------------------------------------------------------------------------
/quick_sort.py:
--------------------------------------------------------------------------------
1 | def quick_sort(arr):
2 |     quick_sort_helper(arr, 0, len(arr) - 1)
3 | 
4 | 
5 | def quick_sort_helper(arr, first, last):
6 |     if first < last:
7 |         pi = partition(arr, first, last)
8 | 
9 |         quick_sort_helper(arr, first, pi - 1)
10 |         quick_sort_helper(arr, pi + 1, last)
11 | 
12 | 
13 | def partition(arr, first, last):
14 |     pivot = arr[first]
15 | 
16 |     left = first + 1
17 |     right = last
18 | 
19 |     done = False
20 |     while not done:
21 |         while left <= right and arr[left] <= pivot:
22 |             left += 1
23 | 
24 |         while arr[right] >= pivot and right >= left:
25 |             right -= 1
26 | 
27 |         if right < left:
28 |             done = True
29 |         else:
30 |             arr[left], arr[right] = arr[right], arr[left]
31 | 
32 |     arr[first], arr[right] = arr[right], arr[first]
33 | 
34 |     return right
35 | 
36 | 
37 | alist = [54, 26, 93, 17, 77, 31, 44, 55, 20]
38 | quick_sort(alist)
39 | print(alist)  # [17, 20, 26, 31, 44, 54, 55, 77, 93]
--------------------------------------------------------------------------------
/range_fn_float.py:
--------------------------------------------------------------------------------
1 | # Make a range function that works for `float` inputs
2 | 
3 | def float_for(start, stop, increment, stop_inclusive=True):
4 |     if stop_inclusive:
5 |         stop += increment
6 | 
7 |     while start < stop:
8 |         # `yield` makes this function a generator: each call to
9 |         # next() resumes here and hands back the current value,
10 |         # instead of the function simply returning once.
11 |         yield start
12 |         start += increment
13 | 
14 | 
15 | for i in float_for(0.5, 0.95, 0.05):
16 |     print(i)
17 | 
18 | """
19 | Output:
20 | 
21 | 0.5
22 | 0.55
23 | 0.6000000000000001
24 | 0.6500000000000001
25 | 0.7000000000000002
26 | 0.7500000000000002
27 | 0.8000000000000003
28 | 0.8500000000000003
29 | 0.9000000000000004
30 | 0.9500000000000004
31 | """
--------------------------------------------------------------------------------
/remove_chars.py:
--------------------------------------------------------------------------------
1 | # Write an efficient function that deletes characters from an ASCII
2 | # string where any character existing in remove must be deleted from
3 | # str. For example, given a str of "Battle of the Vowels: Hawaii vs.
4 | # Grozny" and a remove of "aeiou", the function should transform str
5 | # to "Bttl f th Vwls: Hw vs. Grzny".
6 | 
7 | 
8 | def removeChars(main_string, remove_string):
9 |     result = ""
10 |     for char in main_string:
11 |         if char not in remove_string:
12 |             result += char
13 |     return result
14 | 
15 | 
16 | given_input = "Battle of the Vowels: Hawaii vs. Grozny"
17 | vowels = "aeiou"
18 | print(removeChars(given_input, vowels))
--------------------------------------------------------------------------------
/remove_dup_chars.py:
--------------------------------------------------------------------------------
1 | # Remove duplicate characters from string
2 | 
3 | 
4 | def remove_dup_chars(input_str):
5 |     dedupe_str = ""
6 | 
7 |     for char in input_str:
8 |         if char not in dedupe_str:
9 |             dedupe_str += char
10 | 
11 |     return dedupe_str
12 | 
13 | 
14 | result = remove_dup_chars("zmaxxazkgv")
15 | print(result)  # zmaxkgv
--------------------------------------------------------------------------------
/remove_duplicates.py:
--------------------------------------------------------------------------------
1 | def remove_duplicates(arr):
2 |     return list(dict.fromkeys(arr))
3 | 
4 | 
5 | result = remove_duplicates([0, 0, 0, 1, 1, 2, 2, 3, 4, 5])
6 | print(result)  # [0, 1, 2, 3, 4, 5]
--------------------------------------------------------------------------------
/remove_duplicates_v2.py:
--------------------------------------------------------------------------------
1 | def remove_duplicates_v2(arr):
2 |     dedupe_arr = []
3 | 
4 |     for i in arr:
5 |         if i not in dedupe_arr:
6 |             dedupe_arr.append(i)
7 | 
8 |     return dedupe_arr
9 | 
10 | 
11 | result = remove_duplicates_v2([0, 0, 0, 1, 1, 2, 2, 3, 4, 5])
12 | print(result)  # [0, 1, 2, 3, 4, 5]
--------------------------------------------------------------------------------
/reverse_in_place.py:
--------------------------------------------------------------------------------
1 | def reverse_in_place(arr):
2 |     i = 0
3 |     j = len(arr) - 1
4 | 
5 |     # swap from both ends until the pointers cross
6 |     while i < j:
7 |         arr[i], arr[j] = arr[j], arr[i]
8 |         i += 1
9 |         j -= 1
10 | 
11 |     return arr
12 | 
13 | 
14 | result = reverse_in_place([4, 12, 14, 16, 18])
15 | print(result)  # [18, 16, 14, 12, 4]
--------------------------------------------------------------------------------
/reverse_str_recursive.py:
--------------------------------------------------------------------------------
1 | # Given a string, get it reversed using recursion
2 | 
3 | 
4 | def recursive_reverse(input_str):
5 |     if len(input_str) == 0:
6 |         return ""
7 |     else:
8 |         return recursive_reverse(input_str[1:]) + input_str[0]
9 | 
10 | 
11 | result = recursive_reverse("aabbcc")
12 | print(result)  # ccbbaa
--------------------------------------------------------------------------------
/reverse_words.py:
--------------------------------------------------------------------------------
1 | def reverseWords(sentence):
2 |     stack = []
3 |     words = sentence.split()
4 |     for word in words:
5 |         stack.insert(0, word)
6 |     return " ".join(stack)
7 | 
8 | 
9 | given_input = "Do or do not, there is no try."
10 | print(reverseWords(given_input))  # try. no is there not, do or Do
--------------------------------------------------------------------------------
/rotateMatrix180Deg.py:
--------------------------------------------------------------------------------
1 | # NOTE: despite its name, this helper computes the transpose;
2 | # a transpose followed by the vertical flip below composes to a
3 | # 90-degree rotation, and two such passes give the 180-degree result
4 | def rotateMatrixby90(ipMat, size):
5 |     opMat = [[0 for i in range(size)] for j in range(size)]
6 | 
7 |     for i in range(size):
8 |         for j in range(size):
9 |             opMat[j][i] = ipMat[i][j]
10 | 
11 |     return opMat
12 | 
13 | # flips the matrix vertically (first row becomes last)
14 | def reverseMatrix(ipMat, size):
15 |     opMat = [[0 for i in range(size)] for j in range(size)]
16 |     for i in range(size):
17 |         for j in range(size):
18 |             opMat[abs(i-(size-1))][j] = ipMat[i][j]
19 | 
20 |     return opMat
21 | 
22 | def rotateMatrixby180(ipMat, size):
23 |     mat_1 = rotateMatrixby90(ipMat, size)
24 |     mat_2 = reverseMatrix(mat_1, len(mat_1))
25 |     mat_3 = rotateMatrixby90(mat_2, len(mat_2))
26 |     mat_4 = reverseMatrix(mat_3, len(mat_3))
27 | 
28 |     return mat_4
29 | 
30 | def printMatrix(ipMat, size):
31 |     for i in range(size):
32 |         for j in range(size):
33 |             print(ipMat[i][j], end=" ")
34 |         print('\n')
35 | 
36 | matA = [[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16]]
37 | print("Original-Matrix" + '\n')
38 | printMatrix(matA, len(matA))
39 | 
40 | print("Transposed-Matrix" + '\n')
41 | rotatedMat = rotateMatrixby90(matA, len(matA))
42 | printMatrix(rotatedMat, len(rotatedMat))
43 | 
44 | matB = [[1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15], [4, 8, 12, 16]]
45 | reverseMat = reverseMatrix(matB, len(matB))
46 | print("Reverse-Matrix" + '\n')
47 | printMatrix(reverseMat, len(reverseMat))
48 | 
49 | print("Rotated-180-Matrix" + '\n')
50 | rotatedMat180 = rotateMatrixby180(matA, len(matA))
51 | printMatrix(rotatedMat180, len(rotatedMat180))
--------------------------------------------------------------------------------
/rotate_matrix.py:
--------------------------------------------------------------------------------
1 | # Problem: Rotate a square matrix by
2 | # 90 degree with O(1) extra space
3 | def rotate_by_90(m):
4 |     # unpacking arguments with zip(*) in reverse with [ : :-1]
5 |     tuples = zip(*m[::-1])
6 |     # flattening tuples to list with [list(i)]
7 |     return [list(i) for i in tuples]
8 | 
9 | 
10 | def makeMatrix(array, size):
11 |     # validating size of matrix for given array
12 |     if size ** 2 != len(array):
13 |         return -1
14 |     # make sub array of length size using array slicing
15 |     else:
16 |         matrix = [array[i : i + size] for i in range(0, len(array), size)]
17 |         return rotate_by_90(matrix)
18 | 
19 | 
20 | arr = [1, 2, 3, 4]
21 | dimension = 2
22 | result = makeMatrix(arr, dimension)
23 | # Original Matrix: [[1, 2], [3, 4]]
24 | # Result: [[3, 1], [4, 2]]
--------------------------------------------------------------------------------
/running_median_integers.py:
--------------------------------------------------------------------------------
1 | # Problem: Find Median from Data Stream
2 | 
3 | def findMedian(stream):
4 |     # print(stream)
5 |     streamSize = len(stream)
6 | 
7 |     if streamSize == 1:
8 |         return stream[0]
9 |     else:
10 |         stream = sorted(stream)
11 |         midPt = streamSize // 2
12 | 
13 |         if streamSize % 2 == 1:
14 |             return stream[midPt]
15 |         else:
16 |             # true division: the median of an even-sized stream
17 |             # is the average of the two middle values (may be .5)
18 |             return (stream[midPt] + stream[midPt-1]) / 2
19 | 
20 | def findRunningMedian(inputArray):
21 |     medianArray = []
22 | 
23 |     for i in range(0, len(inputArray)):
24 |         # print(inputArray)
25 |         currentMedian = findMedian(inputArray[0:i+1])
26 |         medianArray.append(currentMedian)
27 | 
28 |     return medianArray
29 | 
30 | 
31 | solution = findRunningMedian([1, 2, 3, 4, 5])
32 | print(solution)  # [1, 1.5, 2, 2.5, 3]
--------------------------------------------------------------------------------
/search_unique.py:
--------------------------------------------------------------------------------
1 | # Problem: Given a sorted array in which all elements
2 | # appear twice (one after one) and one element
3 | # appears only once. Find that element
4 | # Constraints: in O(log n) complexity.
5 | def searchUnique(arr, low, high):
6 |     # Base Cases
7 |     # 1. low is greater than high
8 |     # 2. Array with single element
9 |     if low > high:
10 |         return None
11 |     if low == high:
12 |         return arr[low]
13 |     # Find the middle element
14 |     mid = low + (high - low) // 2
15 |     # if the middle element lies at even place,
16 |     # check i and i+1, if they are same, go to right else left
17 |     if mid % 2 == 0:
18 |         if arr[mid] == arr[mid + 1]:
19 |             return searchUnique(arr, mid + 2, high)
20 |         else:
21 |             return searchUnique(arr, low, mid)
22 |     # if the middle element lies at odd place,
23 |     # check i-1 and i, if they are same, go to right else left
24 |     # Replace mid by mid-1 for above block
25 |     else:
26 |         if arr[mid - 1] == arr[mid]:
27 |             return searchUnique(arr, mid + 1, high)
28 |         else:
29 |             return searchUnique(arr, low, mid - 1)
30 | 
31 | 
32 | array = [1, 1, 2, 4, 4, 5, 5, 6, 6]
33 | result = searchUnique(array, 0, len(array) - 1)
34 | print(result)
35 | # Result: 2
--------------------------------------------------------------------------------
/selection_sort.py:
--------------------------------------------------------------------------------
1 | # A simple implementation of Selection Sort
2 | 
3 | def selectionSort(arr):
4 |     i = 0
5 |     while i < len(arr):
6 |         min_index = i
7 |         for j in range(i+1, len(arr)):
8 |             if arr[j] < arr[min_index]:
9 |                 min_index = j
10 | 
11 |         arr[i], arr[min_index] = arr[min_index], arr[i]
12 |         i = i + 1
13 | 
14 |     return arr
15 | 
16 | arr = [2, 6, 1, 5, 3, 4]
17 | res = selectionSort(arr)
18 | print(res)  # [1, 2, 3, 4, 5, 6]
19 | 
--------------------------------------------------------------------------------
/signOfProduct.py:
--------------------------------------------------------------------------------
1 | # Problem: Given an array arr[] of n integers,
2 | # the integers can be positive, negative or 0
3 | # return the sign of the product of the elements
4 | #  1 : positive
5 | # -1 : negative
6 | #  0 : zero
7 | 
8 | 
9 | def getSignOfProduct(array):
10 | 
11 |     sign = 1
12 | 
13 |     for num in array:
14 | 
15 |         if num == 0:
16 |             return 0
17 | 
18 |         if num < 0:
19 |             sign = -1 * sign
20 | 
21 |     return sign
22 | 
23 | 
24 | arr = [10, 45, -9, 3, -4, -5, 7, 32, 0, 12, 45, -1]
25 | res = getSignOfProduct(arr)
26 | print(arr, res)
27 | 
28 | # Result: [10, 45, -9, 3, -4, -5, 7, 32, 0, 12, 45, -1] 0
--------------------------------------------------------------------------------
/stack_data_structure.py:
--------------------------------------------------------------------------------
1 | class Stack:
2 |     def __init__(self):
3 |         self.stack = []
4 | 
5 |     def push(self, value):
6 |         self.stack.append(value)
7 | 
8 |     def pop(self):
9 |         if self.isEmpty():
10 |             print("Stack underflow...")
11 |             return None
12 |         else:
13 |             # return the popped element to the caller
14 |             return self.stack.pop()
15 | 
16 |     def size(self):
17 |         return len(self.stack)
18 | 
19 |     def isEmpty(self):
20 |         return self.size() == 0
21 | 
22 |     def peek(self):
23 |         if self.isEmpty():
24 |             return None
25 |         else:
26 |             return self.stack[-1]
--------------------------------------------------------------------------------
/stock_span.py:
--------------------------------------------------------------------------------
1 | # Context : The span Si of the stock’s price on a given day i
2 | # is defined as the maximum number of consecutive days just before
3 | # the given day, for which the price of the stock on the current
4 | # day is less than or equal to its price on the given day.
5 | 
6 | # Problem: We have a series of n daily price quotes for a stock
7 | # and we need to calculate span of stock’s price for all n days
8 | 
9 | 
10 | def calculate_span(stock_quotes, span):
11 |     # span for the first quote will always be 1
12 |     span[0] = 1
13 |     for i in range(1, len(stock_quotes), 1):
14 |         # initialize span value to be 1 for each ith quote
15 |         span[i] = 1
16 |         # scan for all the quotes to the left
17 |         j = i - 1
18 |         # if the preceding quote has a value less than or equal to current quote
19 |         # increase the span value of the current quote
20 |         while j >= 0 and stock_quotes[i] >= stock_quotes[j]:
21 |             span[i] = span[i] + 1
22 |             j = j - 1
23 |     return span
24 | 
25 | 
26 | quotes = [10, 4, 5, 90, 120, 80]
27 | # initialize span as an empty list with same length as quotes
28 | span_list = [None] * len(quotes)
29 | print(calculate_span(quotes, span_list))
30 | # Result : [1, 1, 2, 4, 5, 1]
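31 | 
32 | # Hedged O(n) variant (sketch): the classic stack-based stock span
33 | # keeps indices of quotes that are still "span blockers", replacing
34 | # the inner backwards scan above.
35 | def calculate_span_v2(stock_quotes):
36 |     span = [1] * len(stock_quotes)
37 |     stack = []  # indices of quotes taller than everything after them so far
38 |     for i, quote in enumerate(stock_quotes):
39 |         # drop indices whose quotes are <= the current quote
40 |         while stack and stock_quotes[stack[-1]] <= quote:
41 |             stack.pop()
42 |         span[i] = i + 1 if not stack else i - stack[-1]
43 |         stack.append(i)
44 |     return span
45 | 
46 | # print(calculate_span_v2(quotes))  # [1, 1, 2, 4, 5, 1]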
--------------------------------------------------------------------------------
/sum_array_recursion.py:
--------------------------------------------------------------------------------
1 | # Write a program to sum a given array
2 | # using recursion
3 | 
4 | def recur_sum(arr, i):
5 |     #print(arr[i])
6 |     if i < 1:
7 |         return arr[i]
8 |     else:
9 |         return arr[i] + recur_sum(arr, i-1)
10 | 
11 | 
12 | arr = [-1, 2, -3, 4, 5]
13 | print(recur_sum(arr, len(arr)-1))  # 7
--------------------------------------------------------------------------------
/timeseries.py:
--------------------------------------------------------------------------------
1 | # Libraries Included:
2 | # Numpy, Scipy, Scikit, Pandas
3 | 
4 | import pandas as pd
5 | #print("Hello, world!")
6 | 
7 | def get_merged(series1, series2):
8 |     temp = {}
9 | 
10 |     for i in series1:
11 |         if i[0] not in temp.keys():
12 |             temp[i[0]] = i[1]
13 |         else:
14 |             curr = temp.get(i[0])
15 |             temp[i[0]] = (curr + i[1]) / 2
16 | 
17 |     for i in series2:
18 |         if i[0] not in temp.keys():
19 |             temp[i[0]] = i[1]
20 |         else:
21 |             curr = temp.get(i[0])
22 |             temp[i[0]] = (curr + i[1]) / 2
23 | 
24 |     #print(temp)
25 |     # ISO date strings sort chronologically, so sorting the
26 |     # items by key orders the merged series by date
27 |     ordered = sorted(temp.items())
28 | 
29 |     return ordered
30 | 
31 | # timeseries data
32 | series1 = [
33 |     ('2010-01-01', 34),
34 |     ('2010-01-02', 27),
35 |     ('2010-01-04', 58),
36 |     ('2010-01-05', 22)]
37 | 
38 | series2 = [
39 |     ('2010-01-01', 15),
40 |     ('2010-01-03', 39),
41 |     ('2010-01-05', 23),
42 |     ('2010-01-06', 47)]
43 | 
44 | res = get_merged(series1, series2)
45 | print(res)
46 | 
47 | # merged = [
48 | #     ('2010-01-01', 24.5),
49 | #     ('2010-01-02', 27),
50 | #     ('2010-01-03', 39),
51 | #     ('2010-01-04', 58),
52 | #     ('2010-01-05', 22.5),
53 | #     ('2010-01-06', 47)]
54 | 
55 | # Generalisation (completing the scratch notes that followed): to merge
56 | # any number of series, track how many series have contributed to each
57 | # date so the running average stays correct.
58 | def merge_series(*series_list):
59 |     temp = {}
60 |     freq_dict = {}  # date -> number of values seen so far
61 |     for series in series_list:
62 |         for date, value in series:
63 |             if date not in temp:
64 |                 temp[date] = value
65 |                 freq_dict[date] = 1
66 |             else:
67 |                 freq_dict[date] += 1
68 |                 # incremental running average: avg += (x - avg) / n
69 |                 temp[date] = temp[date] + (value - temp[date]) / freq_dict[date]
70 |     return sorted(temp.items())
--------------------------------------------------------------------------------
/union_arrays.py:
--------------------------------------------------------------------------------
1 | # Problem: Given two sorted array of sizes m and n
2 | # in which all elements are distinct. Find the
3 | # union between them
4 | # Constraints: in O(m+n) complexity.
5 | 
6 | def unionArrays(x, y, m, n):
7 |     union_arr = []
8 |     i = j = 0
9 | 
10 |     # two-pointer merge keeps the required O(m+n) bound;
11 |     # elements are distinct within each sorted array
12 |     while i < m and j < n:
13 |         if x[i] < y[j]:
14 |             union_arr.append(x[i])
15 |             i += 1
16 |         elif x[i] > y[j]:
17 |             union_arr.append(y[j])
18 |             j += 1
19 |         else:
20 |             union_arr.append(x[i])
21 |             i += 1
22 |             j += 1
23 | 
24 |     # copy whatever remains of either array
25 |     union_arr.extend(x[i:m])
26 |     union_arr.extend(y[j:n])
27 | 
28 |     print(union_arr)  # [1, 2, 3, 4, 5, 6]
29 |     return
30 | 
31 | list_a = [1, 2, 3, 4, 5]
32 | list_b = [2, 3, 5, 6]
33 | unionArrays(list_a, list_b, len(list_a), len(list_b))
--------------------------------------------------------------------------------
/username_validation.py:
--------------------------------------------------------------------------------
1 | # Have the function UsernameValidation(`str`) take the `str` parameter being passed and determine if the string is a valid username according to the following rules:
2 | 
3 | # 1. The username is between 4 and 25 characters.
4 | # 2. It must start with a letter.
5 | # 3. It can only contain letters, numbers, and the underscore character.
6 | # 4. It cannot end with an underscore character.
7 | 
8 | # If the username is valid then your program should return the string `true`, otherwise return the string `false`.
9 | 
10 | def UsernameValidation(strParam):
11 | 
12 |     # username is between 4 and 25 characters
13 |     if len(strParam) < 4 or len(strParam) > 25:
14 |         return False
15 | 
16 |     # must start with a letter
17 |     if not strParam[0].isalpha():
18 |         return False
19 | 
20 |     # can't end with an underscore
21 |     if strParam[-1] == '_':
22 |         return False
23 | 
24 |     # contains only letters, numbers and underscores
25 |     valid_grammar = set('abcdefghijklmnopqrstuvwxyz0123456789_')
26 | 
27 |     for ch in strParam:
28 |         if ch.lower() not in valid_grammar:
29 |             return False
30 | 
31 |     return True
32 | 
33 | # keep this function call here
34 | TC1 = "aa_"
35 | TC2 = "uaa__hello_worldW"
36 | 
37 | print(TC1, UsernameValidation(TC1))  # aa_ False
38 | print(TC2, UsernameValidation(TC2))  # uaa__hello_worldW True
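39 | 
40 | # Hedged one-liner alternative (sketch): a regular expression that
41 | # mirrors the four rules -- starts with a letter, only word characters,
42 | # 4-25 characters total, and no trailing underscore.
43 | import re
44 | 
45 | def UsernameValidationRegex(strParam):
46 |     return bool(re.fullmatch(r"[A-Za-z][A-Za-z0-9_]{2,23}[A-Za-z0-9]", strParam))
47 | 
48 | # print(UsernameValidationRegex(TC2))  # True
--------------------------------------------------------------------------------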