├── .gitignore
├── CODE_Of_CONDUCT.md
├── CONTRIBUTING.md
├── DSA-master
│   └── Day-1
│       ├── pascalTraingleRecursive.py
│       └── setMatrixZero.py
├── DataScience
│   ├── GeneralMLPrep.md
│   ├── LLMPrep.md
│   ├── MlAlgoCheatSheet.md
│   ├── MlAlgoKeyFormulae.md
│   └── study_plan.md
├── GenerativeAI
│   ├── 1.1.What does generative truly mean.md
│   ├── 1.2 Next Word Prediction.md
│   ├── 1.3 Embedding Process. - Mathematical Intution.md
│   ├── 1.4 Attention Block - Python Example.md
│   ├── 1.5 MLP Block - Python Example.md
│   ├── 1.6 Positional Encoding - Python Example.md
│   ├── 1.7 End to End process of Attention.md
│   ├── 2. How does FAISS work.md
│   ├── 3. FAISS Advanced explaination.md
│   ├── 4. Transformers and Vector DB Interview prep.md
│   ├── 5. FAISS interview prep.md
│   ├── 6.RecursiveReferenceRAG.md
│   └── References.md
├── Interview_Questions.md
├── LICENSE
├── Python_Programming_Quiz.md
├── README.md
├── adjacentElementProduct.py
├── atoi.py
├── binary_search_recursive.py
├── bits_wilp
│   ├── Ex2_Numpy_Q1.ipynb
│   ├── Ex2_Numpy_Q2.ipynb
│   ├── Ex2_Numpy_Q3.ipynb
│   ├── Ex2_Numpy_Q4.ipynb
│   ├── Ex2_Numpy_Q5.ipynb
│   ├── Ex2_Numpy_Q6.ipynb
│   ├── Ex2_Pandas_DataViz.ipynb
│   ├── Quiz 1_ S2-20_DSECLPFDS.pdf
│   ├── binomialCoefficient.py
│   ├── calculateFrequency.py
│   ├── isAngstrom.py
│   ├── isPalindrome.py
│   ├── practice.py
│   ├── primeFactorization.py
│   ├── sample.txt
│   ├── searching.py
│   ├── sumOfDigits.py
│   └── topThreeFrequent.py
├── bresenham_line_algorithm.py
├── bst_nodes_in_range.py
├── bubble_sort.py
├── calculateClockAngle.py
├── check_anagrams.py
├── check_semiprime.py
├── data_science_interviews.md
├── dfs_bfs.py
├── diameterOfTree.py
├── estimate_pi.py
├── find_k_largest.py
├── find_m_to_last_llist.py
├── find_pairs_sum_k.py
├── find_products_pair_k.py
├── find_pythagoras_triplet.py
├── find_second_largest_in_binary_tree.py
├── first_n_fibo.py
├── first_non_repeating.py
├── first_recurring_character.py
├── first_unique_letter.py
├── gamblers_ruin.py
├── gen_largest_num_frm_list.py
├── general_tree_structure.py
├── getMinPlatforms.py
├── get_dup_chars.py
├── hasZeroSumSubArray.py
├── has_only_digits.py
├── haversine.py
├── heap_structure.py
├── hundred_without_int.py
├── interger_to_roman_num.py
├── intersection_arrays.py
├── isMatrixSymmetric.py
├── is_anagram.py
├── is_anagram_using_collections.py
├── is_num_palindrome.py
├── is_numeric.py
├── josephus.py
├── josephus_improved.py
├── josephus_improved_v3.py
├── karatsuba.py
├── level_order_tree.py
├── linked_list_data_structure.py
├── loop_in_linkedlist.py
├── lowest_common_ancestor.py
├── majority_element.py
├── max_in_array.py
├── maximum_subarray_sum.py
├── merge_sort.py
├── min_max_array_oneLoop.py
├── move_zeros_to_end.py
├── no_sibling_tree.py
├── oddAscEvenDesc.py
├── pascal_triangle.py
├── pascals_triangle_improved.py
├── permutations.py
├── permute_strings.py
├── preorder_iterative_bst.py
├── priority_queue_simple.py
├── processStringToDict.py
├── product_puzzle.py
├── queue_data_structure.py
├── quick_sort.py
├── range_fn_float.py
├── remove_chars.py
├── remove_dup_chars.py
├── remove_duplicates.py
├── remove_duplicates_v2.py
├── reverse_in_place.py
├── reverse_str_recursive.py
├── reverse_words.py
├── rotateMatrix180Deg.py
├── rotate_matrix.py
├── running_median_integers.py
├── search_unique.py
├── selection_sort.py
├── signOfProduct.py
├── stack_data_structure.py
├── stock_span.py
├── sum_array_recursion.py
├── timeseries.py
├── union_arrays.py
└── username_validation.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/CODE_Of_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at singhal.amogh1995@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | When contributing to this repository, please first discuss the change you wish to make via issue,
4 | email, or any other method with the owners of this repository before making a change.
5 |
6 | Please note we have a code of conduct, please follow it in all your interactions with the project.
7 |
8 | ## Pull Request Process
9 |
10 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a
11 | build.
12 | 2. Update the README.md with details of changes to the interface, this includes new environment
13 | variables, exposed ports, useful file locations and container parameters.
14 | 3. Increase the version numbers in any examples files and the README.md to the new version that this
15 | Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/).
16 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you
17 | do not have permission to do that, you may request the second reviewer to merge it for you.
18 |
19 | ## Code of Conduct
20 |
21 | ### Our Pledge
22 |
23 | In the interest of fostering an open and welcoming environment, we as
24 | contributors and maintainers pledge to making participation in our project and
25 | our community a harassment-free experience for everyone, regardless of age, body
26 | size, disability, ethnicity, gender identity and expression, level of experience,
27 | nationality, personal appearance, race, religion, or sexual identity and
28 | orientation.
29 |
30 | ### Our Standards
31 |
32 | Examples of behavior that contributes to creating a positive environment
33 | include:
34 |
35 | * Using welcoming and inclusive language
36 | * Being respectful of differing viewpoints and experiences
37 | * Gracefully accepting constructive criticism
38 | * Focusing on what is best for the community
39 | * Showing empathy towards other community members
40 |
41 | Examples of unacceptable behavior by participants include:
42 |
43 | * The use of sexualized language or imagery and unwelcome sexual attention or
44 | advances
45 | * Trolling, insulting/derogatory comments, and personal or political attacks
46 | * Public or private harassment
47 | * Publishing others' private information, such as a physical or electronic
48 | address, without explicit permission
49 | * Other conduct which could reasonably be considered inappropriate in a
50 | professional setting
51 |
52 | ### Our Responsibilities
53 |
54 | Project maintainers are responsible for clarifying the standards of acceptable
55 | behavior and are expected to take appropriate and fair corrective action in
56 | response to any instances of unacceptable behavior.
57 |
58 | Project maintainers have the right and responsibility to remove, edit, or
59 | reject comments, commits, code, wiki edits, issues, and other contributions
60 | that are not aligned to this Code of Conduct, or to ban temporarily or
61 | permanently any contributor for other behaviors that they deem inappropriate,
62 | threatening, offensive, or harmful.
63 |
64 | ### Scope
65 |
66 | This Code of Conduct applies both within project spaces and in public spaces
67 | when an individual is representing the project or its community. Examples of
68 | representing a project or community include using an official project e-mail
69 | address, posting via an official social media account, or acting as an appointed
70 | representative at an online or offline event. Representation of a project may be
71 | further defined and clarified by project maintainers.
72 |
73 | ### Enforcement
74 |
75 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
76 | reported by contacting the project team at [INSERT EMAIL ADDRESS]. All
77 | complaints will be reviewed and investigated and will result in a response that
78 | is deemed necessary and appropriate to the circumstances. The project team is
79 | obligated to maintain confidentiality with regard to the reporter of an incident.
80 | Further details of specific enforcement policies may be posted separately.
81 |
82 | Project maintainers who do not follow or enforce the Code of Conduct in good
83 | faith may face temporary or permanent repercussions as determined by other
84 | members of the project's leadership.
85 |
86 | ### Attribution
87 |
88 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
89 | available at [http://contributor-covenant.org/version/1/4][version]
90 |
91 | [homepage]: http://contributor-covenant.org
92 | [version]: http://contributor-covenant.org/version/1/4/
93 |
--------------------------------------------------------------------------------
/DSA-master/Day-1/pascalTraingleRecursive.py:
--------------------------------------------------------------------------------
1 | def computeCoeff(row, col):
2 | """
3 | This method computes the Binomial coefficient for each point in the Pascal Triangle
4 | """
5 | if col == 0 or row == col:
6 | return 1 # for the corners of each row
7 | else:
8 | return computeCoeff(row-1, col) + computeCoeff(row-1, col-1) # take the numbers in previous row same column and one column left of that number
9 |
10 | def printTriangle(n):
11 | """
12 | This method prints the Pascal triangle with `n` rows
13 | """
14 | for r in range(n):
15 | for c in range(r+1):
16 | print(computeCoeff(r,c), end=' ')
17 | print('\n')
18 |
19 | printTriangle(10)
20 |
21 | # Output
22 | """
23 | 1
24 |
25 | 1 1
26 |
27 | 1 2 1
28 |
29 | 1 3 3 1
30 |
31 | 1 4 6 4 1
32 |
33 | 1 5 10 10 5 1
34 |
35 | 1 6 15 20 15 6 1
36 |
37 | 1 7 21 35 35 21 7 1
38 |
39 | 1 8 28 56 70 56 28 8 1
40 |
41 | 1 9 36 84 126 126 84 36 9 1
42 | """
43 |
--------------------------------------------------------------------------------
/DSA-master/Day-1/setMatrixZero.py:
--------------------------------------------------------------------------------
1 | # Set Matrix Zero
2 | # Problem Statement: Given a matrix if an element in the matrix is 0
3 | # then you will have to set its entire column and row to 0 and then
4 | # return the matrix.
5 |
6 | # Input: matrix=[[1,1,1],[1,0,1],[1,1,1]]
7 | # Output: [[1,0,1],[0,0,0],[1,0,1]]
8 |
9 | def getZeros(matrix, shape):
10 | """
11 | returns the location of zeros in the matrix
12 | as a list of tuples
13 | Time Complexity: O(M*N) where MxN is the shape of matrix
14 | """
15 | r,c = shape
16 | zeros = []
17 | for i in range(0,r):
18 | for j in range(0,c):
19 | if matrix[i][j] == 0:
20 | zeros.append((i,j))
21 | return zeros
22 |
23 | def setZeros(matrix, shape, zeros):
24 | """
25 | returns the modified matrix
26 |     Time complexity: O(Z*(M+N)), where M x N is the shape of the matrix and Z is the number of zeros found
27 | """
28 | r,c = shape
29 |
30 | for z in zeros:
31 | m,n = z
32 | for i in range(0,r):
33 | matrix[i][n] = 0
34 |
35 | for j in range(0,c):
36 | matrix[m][j] = 0
37 |
38 | return matrix
39 |
40 |
41 |
42 | M = [[0,1,2,0],[3,4,5,2],[1,3,1,5]]
43 | size_mat = (len(M), len(M[0]))
44 |
45 | print("Original Matrix: ", M)
46 | print("Shape of the matrix: ",size_mat)
47 |
48 | zeroLocs = getZeros(M, size_mat)
49 | print("Zeros found at:", zeroLocs)
50 |
51 | M_z = setZeros(M, size_mat, zeroLocs)
52 | print("Modified matrix is: ", M_z)
53 |
--------------------------------------------------------------------------------
/DataScience/GeneralMLPrep.md:
--------------------------------------------------------------------------------
1 | CNN
2 | ==========
3 | * CNNs are deep learning architectures that are primarily used for processing image data.
4 | * The special operation known as convolution, performed with learnable filters, helps them extract features like edges and textures.
5 | * ReLU is applied as an activation function to add non-linearity.
6 | * Pooling is performed to reduce the spatial dimensions while retaining important information. This reduces the computational load and helps control overfitting.
7 | * Fully Connected Layer (FCL): after several convolution and pooling operations, the output is passed through an FCL to generate the class probabilities needed for classification.
8 |
9 | How CNNs Work:
10 | ==========
11 | * The input image is transformed into a numerical representation, where each pixel is assigned a value based on its intensity.
12 | * The convolution operation involves sliding the filter across the image and performing element-wise multiplication, followed by summation to create a feature map.
13 | * As data progresses through multiple layers, CNNs learn increasingly complex features, from simple edges in early layers to intricate shapes in deeper layers.
14 |
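To make the convolution → ReLU → pooling pipeline concrete, here is a minimal NumPy sketch; the image values, filter, and sizes are invented for illustration:

```python
import numpy as np

# Toy 6x6 grayscale "image" and a 3x3 vertical-edge filter (values invented)
image = np.arange(36, dtype=float).reshape(6, 6)
kernel = np.array([[1., 0., -1.],
                   [1., 0., -1.],
                   [1., 0., -1.]])

def conv2d(img, k):
    """Valid 2D convolution: slide the filter over the image,
    multiply element-wise, and sum to build the feature map."""
    kh, kw = k.shape
    oh, ow = img.shape[0] - kh + 1, img.shape[1] - kw + 1
    out = np.zeros((oh, ow))
    for i in range(oh):
        for j in range(ow):
            out[i, j] = np.sum(img[i:i+kh, j:j+kw] * k)
    return out

feature_map = conv2d(image, kernel)       # convolution extracts a feature map
activated = np.maximum(0, feature_map)    # ReLU adds non-linearity
pooled = activated.reshape(2, 2, 2, 2).max(axis=(1, 3))  # 2x2 max pooling
print(feature_map.shape, pooled.shape)    # (4, 4) -> (2, 2)
```
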
15 | Applications:
16 | ==========
17 | CNNs are widely used in various fields such as:
18 | * Image Recognition: Identifying objects in images (e.g., facial recognition).
19 | * Medical Image Analysis: Analyzing X-rays or MRIs for diagnostic purposes.
20 | * Autonomous Vehicles: Object detection and scene understanding.
21 |
22 | RNN
23 | ==========
24 | * RNNs are a class of neural networks that are excellent at processing sequential data
25 | * They maintain an internal state: at time step `t`, the input `x(t)` is combined with the hidden state from the previous step `h(t-1)` to produce a new hidden state `h(t)`
26 | * h(t) = f[ W(h) * h(t-1) + W(x) * x(t) + b ], where W(h) and W(x) are weight matrices, b is the bias term, and f is the activation function
27 |
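A single recurrent step can be written directly from the formula above; a minimal NumPy sketch, with tanh as the activation and made-up sizes:

```python
import numpy as np

hidden_size, input_size = 4, 3
rng = np.random.default_rng(0)

W_h = rng.normal(size=(hidden_size, hidden_size))  # hidden-to-hidden weights
W_x = rng.normal(size=(hidden_size, input_size))   # input-to-hidden weights
b = np.zeros(hidden_size)                          # bias term

def rnn_step(h_prev, x_t):
    """h(t) = f[ W(h) @ h(t-1) + W(x) @ x(t) + b ], with f = tanh."""
    return np.tanh(W_h @ h_prev + W_x @ x_t + b)

# Run a toy 5-step input sequence through the recurrence
h = np.zeros(hidden_size)
for x_t in rng.normal(size=(5, input_size)):
    h = rnn_step(h, x_t)
print(h)
```
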
28 | Applications:
29 | ==========
30 | RNNs are commonly used in:
31 | * Natural Language Processing: Tasks such as language modeling, text generation, and sentiment analysis.
32 | * Speech Recognition: Processing audio signals to convert speech into text.
33 | * Time Series Prediction: Forecasting stock prices or weather conditions based on historical data.
34 |
35 | Decision Tree
36 | ==========
37 | * A decision tree is a supervised ML algorithm used in classification and regression tasks
38 | * It models decisions and their possible consequences in the form of a tree-like structure
39 | * A branch represents a `decision rule` and an internal node represents a `feature`. The leaf node, or the terminal node of a branch, is the `outcome`
40 |
41 | Building a Decision Tree:
42 | ==========
43 | DEINR (pronounced as "Diner"): Data; Entropy; InformationGain; NodeSelection; RecursiveSplitting
44 | * Data Input: Start with the entire dataset.
45 | * Entropy Calculation: Calculate the entropy of the target variable and predictor attributes to measure impurity.
46 | * Information Gain: Determine the information gain for each attribute to identify which feature best splits the data.
47 | * Node Selection: Choose the attribute with the highest information gain as the root node.
48 | * Recursive Splitting: Repeat this process recursively for each branch until all branches are finalized or a *stopping criterion is met (e.g., maximum depth or minimum samples per leaf)*
49 |
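As a quick illustration of the entropy and information-gain steps above, a minimal NumPy sketch (the toy labels and the candidate split are invented):

```python
import numpy as np

def entropy(labels):
    """Shannon entropy of a label array."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

# Toy binary target and a candidate split into two branches
y = np.array([0, 0, 0, 1, 1, 1, 1, 1])
left, right = y[:4], y[4:]          # hypothetical split on some feature

parent_H = entropy(y)
child_H = (len(left) / len(y)) * entropy(left) + (len(right) / len(y)) * entropy(right)
info_gain = parent_H - child_H      # higher gain -> better split
print(parent_H, child_H, info_gain)
```
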
50 | Advantages:
51 | ==========
52 | * Easy to interpret and visualize.
53 | * Requires little data preprocessing (no need for normalization).
54 | * Can handle both numerical and categorical data.
55 |
56 | Disadvantages:
57 | ============
58 | * Prone to overfitting, especially with deep trees.
59 | * Sensitive to small variations in data.
60 |
61 | Random Forest
62 | ==========
63 | * Random Forest is an ensemble technique that combines multiple decision trees
64 | * It mitigates overfitting by averaging the results of many trees, which individually may have high variance
65 |
66 | Building a Random Forest:
67 | ==========
68 | BTA (pronounced as "beta"): BootStrapSampling; TreeConstruction; Aggregation
69 | * Bootstrap Sampling: Randomly select subsets of the training data with replacement to create multiple datasets.
70 | * Tree Construction: For each subset, build a decision tree using a random selection of features at each split.
71 | * Aggregation: During prediction, aggregate the results from all trees (e.g., majority vote for classification or average for regression)
72 |
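A minimal sketch of the bootstrap-and-vote loop; note that `fit_stub_tree` is a deliberately trivial stand-in (a real forest would fit a decision tree with random feature selection at each split):

```python
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 5))        # toy features
y = (X[:, 0] > 0).astype(int)        # toy binary target

def fit_stub_tree(X_s, y_s):
    """Stand-in for tree construction: always predicts the majority
    class of its bootstrap sample."""
    majority = int(y_s.mean() >= 0.5)
    return lambda X_new: np.full(len(X_new), majority)

# Bootstrap sampling + "tree" construction
trees = []
for _ in range(25):
    idx = rng.integers(0, len(X), size=len(X))   # sample with replacement
    trees.append(fit_stub_tree(X[idx], y[idx]))

# Aggregation: majority vote across trees
votes = np.stack([t(X) for t in trees])
y_pred = (votes.mean(axis=0) >= 0.5).astype(int)
print(y_pred[:10])
```
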
73 | Advantages:
74 | ==========
75 | * Reduces overfitting compared to individual decision trees.
76 | * Handles large datasets with higher dimensionality well.
77 | * Provides feature importance scores.
78 |
79 | Disadvantages:
80 | ==========
81 | * More complex and less interpretable than single decision trees.
82 | * Requires more computational resources.
83 |
84 | Bagging or (B)ootstrap (Agg)regating
85 | ====================================
86 | * This is an ensemble technique aimed at improving the accuracy and stability of ML models
87 | * It is done by combining multiple models trained on different subsets of the training data
88 |
89 | How Bagging Works:
90 | ===============
91 | * Multiple Samples: Generate multiple bootstrap samples from the original dataset.
92 | * Model Training: Train a separate model (e.g., decision tree) on each bootstrap sample.
93 | * Final Prediction: Aggregate predictions from all models (e.g., majority voting for classification, averaging for regression)
94 |
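A tiny NumPy illustration of the variance-reduction effect of bagging, where each "model" is simply the mean of its bootstrap sample:

```python
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(loc=10, scale=3, size=200)   # toy training data

# Each bootstrap "model" predicts the mean of its resampled data
preds = []
for _ in range(50):
    sample = rng.choice(data, size=len(data), replace=True)
    preds.append(sample.mean())

# Bagged prediction: average the individual models
print("single model:", preds[0], "bagged:", np.mean(preds))
```
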
95 | Advantages:
96 | ==========
97 | * Reduces variance and helps prevent overfitting.
98 | * Improves model robustness against noise in data.
99 |
100 | Disadvantages:
101 | =================
102 | * May not significantly improve performance if base learners are not diverse.
103 |
104 | Boosting
105 | ====================================
106 | * This is an ensemble technique aimed at improving the accuracy and stability of ML models
107 | * It is done by combining weak learners (models that perform slightly better than random chance) to create a strong learner
108 | * The strong learner is built in iterations, with a focus on misclassified instances.
109 |
110 | How Boosting Works:
111 | ===============
112 | * Sequential Learning: Models are trained sequentially, where each new model focuses on correcting errors made by previous models.
113 | * Weight Adjustment: Misclassified instances are given higher weights so that subsequent models pay more attention to them.
114 | * Final Prediction: Combine predictions from all models, typically using weighted voting or averaging
115 |
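The weight-adjustment step can be sketched in a few lines of NumPy (an AdaBoost-style update; the weak learner's predictions here are invented placeholders):

```python
import numpy as np

y = np.array([1, 1, -1, -1, 1])      # true labels in {-1, +1}
pred = np.array([1, -1, -1, 1, 1])   # hypothetical weak learner output
w = np.full(len(y), 1 / len(y))      # start with uniform instance weights

# Weighted error and learner weight (AdaBoost)
err = np.sum(w[pred != y])
alpha = 0.5 * np.log((1 - err) / err)

# Misclassified instances get higher weight for the next learner
w = w * np.exp(-alpha * y * pred)
w = w / w.sum()
print(alpha, w)
```
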
116 | Popular Boosting Algorithms:
117 | ==========
118 | * AdaBoost
119 | * Gradient Boosting
120 | * XGBoost
121 |
122 | Advantages:
123 | ==========
124 | * Often achieves high accuracy and performs well even with limited data.
125 | * Can handle various types of data and relationships.
126 |
127 | Disadvantages:
128 | =================
129 | * More prone to overfitting than bagging if not carefully tuned.
130 | * Requires careful tuning of parameters.
131 |
--------------------------------------------------------------------------------
/DataScience/LLMPrep.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/DataScience/MlAlgoCheatSheet.md:
--------------------------------------------------------------------------------
1 | ## Popular Algorithms in Data Science
2 |
3 | In data science, various algorithms are employed for tasks such as regression and classification. Each algorithm has associated loss functions and performance metrics that help evaluate its effectiveness. Below is a detailed overview of popular algorithms, their loss functions, performance metrics, and caveats for their use.
4 |
5 | ### 1. **Linear Regression**
6 | - **Loss Function:** Mean Squared Error (MSE)
7 | - **Performance Metrics:** R-squared (R²), Adjusted R², Mean Absolute Error (MAE)
8 | - **Caveats:** Sensitive to outliers; performs poorly when the relationship between features and target is non-linear.
9 |
10 | ### 2. **Logistic Regression**
11 | - **Loss Function:** Binary Cross-Entropy Loss (Log Loss)
12 | - **Performance Metrics:** Accuracy, Precision, Recall, F1 Score
13 | - **Caveats:** Assumes linearity between the independent variables and the log odds of the dependent variable; not suitable for multi-class problems without modification.
14 |
15 | ### 3. **Decision Trees**
16 | - **Loss Function:** Gini Impurity (for classification), Mean Squared Error (for regression)
17 | - **Performance Metrics:** Accuracy, Mean Absolute Error (MAE), Root Mean Squared Error (RMSE)
18 | - **Caveats:** Prone to overfitting; sensitive to small changes in data which can lead to different tree structures.
19 |
20 | ### 4. **Support Vector Machines (SVM)**
21 | - **Loss Function:** Hinge Loss (for classification), Epsilon-insensitive Loss (for regression)
22 | - **Performance Metrics:** Accuracy, Precision, Recall
23 | - **Caveats:** Computationally expensive for large datasets; requires careful tuning of hyperparameters like the kernel choice.
24 |
25 | ### 5. **Random Forest**
26 | - **Loss Function:** Mean Squared Error (for regression), Gini Impurity or Cross-Entropy Loss (for classification)
27 | - **Performance Metrics:** Out-of-Bag Error, Accuracy
28 | - **Caveats:** Can be less interpretable than simpler models; may require significant computational resources.
29 |
30 | ### 6. **Gradient Boosting Machines (GBM)**
31 | - **Loss Function:** Log Loss (for classification), Mean Squared Error (for regression)
32 | - **Performance Metrics:** Log-Likelihood, RMSE
33 | - **Caveats:** Sensitive to overfitting if not properly regularized; requires careful tuning of learning rate and tree depth.
34 |
35 | ### 7. **Neural Networks**
36 | - **Loss Function:** Cross-Entropy Loss (for classification), Mean Squared Error (for regression)
37 | - **Performance Metrics:** Accuracy, F1 Score, Area Under Curve (AUC)
38 | - **Caveats:** Requires large amounts of data; can be prone to overfitting if not regularized properly; less interpretable compared to traditional models.
39 |
40 | ### 8. **K-Means Clustering**
41 | - **Loss Function:** Sum of Squared Errors (SSE)
42 | - **Performance Metrics:** Silhouette Score, Davies-Bouldin Index
43 | - **Caveats:** Assumes spherical clusters; sensitive to initial centroid placement; requires specifying the number of clusters in advance.
44 |
45 | ## Summary of Loss Functions and Performance Metrics
46 |
47 | | Algorithm | Loss Function | Performance Metrics |
48 | |------------------------|------------------------------------|----------------------------------------|
49 | | Linear Regression | Mean Squared Error | R², MAE |
50 | | Logistic Regression | Binary Cross-Entropy | Accuracy, F1 Score |
51 | | Decision Trees | Gini Impurity / MSE | Accuracy, MAE |
52 | | Support Vector Machines | Hinge Loss | Accuracy, Precision |
53 | | Random Forest | MSE / Gini Impurity | Out-of-Bag Error |
54 | | Gradient Boosting | Log Loss / MSE | RMSE |
55 | | Neural Networks | Cross-Entropy / MSE | Accuracy, AUC |
56 | | K-Means Clustering | Sum of Squared Errors | Silhouette Score |
57 |
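For intuition, the most common loss functions in the table can each be computed in a line of NumPy; all values below are toy examples:

```python
import numpy as np

y_true = np.array([1, 0, 1, 1])            # binary labels
y_prob = np.array([0.9, 0.2, 0.7, 0.4])    # predicted probabilities
y_hat  = np.array([2.5, 0.3, 2.1, 1.6])    # regression predictions
y_reg  = np.array([3.0, 0.5, 2.0, 1.5])    # regression targets

# Mean Squared Error (linear regression, trees, GBM regression)
mse = np.mean((y_reg - y_hat) ** 2)

# Binary Cross-Entropy / Log Loss (logistic regression, GBM classification)
log_loss = -np.mean(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))

# Hinge Loss (SVM), with labels mapped to {-1, +1} and raw scores f(x)
y_pm = 2 * y_true - 1
scores = np.array([1.2, -0.8, 0.3, -0.1])  # hypothetical SVM outputs
hinge = np.mean(np.maximum(0, 1 - y_pm * scores))

print(mse, log_loss, hinge)
```
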
58 | ## Conclusion
59 |
60 | The choice of algorithm depends on the specific characteristics of the dataset and the nature of the problem being solved. Understanding the strengths and weaknesses of each algorithm helps in selecting the most appropriate one for a given task. For instance, while linear regression is simple and interpretable, it may not capture complex relationships in the data. Conversely, neural networks can model intricate patterns but require more data and computational power.
61 |
71 |
--------------------------------------------------------------------------------
/DataScience/MlAlgoKeyFormulae.md:
--------------------------------------------------------------------------------
1 | ## Popular Algorithms in Data Science with Mathematical Formulations
2 |
3 | Here is an expanded overview of popular algorithms in data science, including their mathematical formulations, loss functions, performance metrics, and caveats.
4 |
5 | ### 1. **Linear Regression**
6 | - **Mathematical Formula:**
7 | $$ y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + ... + \beta_n x_n + \epsilon $$
8 | where $$y$$ is the dependent variable, $$x_i$$ are independent variables, $$\beta_i$$ are coefficients, and $$\epsilon$$ is the error term.
9 | - **Loss Function:** Mean Squared Error (MSE)
10 | $$ MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 $$
11 | - **Performance Metrics:** R-squared (R²), Adjusted R², Mean Absolute Error (MAE)
12 | - **Caveats:** Sensitive to outliers; performs poorly with non-linear relationships.
13 |
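To ground these formulas, a minimal NumPy computation of the MSE loss and the R² metric on toy values:

```python
import numpy as np

y = np.array([3.0, 5.0, 7.0, 9.0])       # actual values
y_hat = np.array([2.8, 5.3, 6.6, 9.4])   # model predictions

mse = np.mean((y - y_hat) ** 2)          # loss function
ss_res = np.sum((y - y_hat) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r2 = 1 - ss_res / ss_tot                 # performance metric
print(f"MSE={mse:.4f}, R^2={r2:.4f}")
```
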
14 | ### 2. **Logistic Regression**
15 | - **Mathematical Formula:**
16 | $$ P(Y=1|X) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + ... + \beta_n x_n)}} $$
17 | - **Loss Function:** Binary Cross-Entropy Loss (Log Loss)
18 | $$ L = -\frac{1}{n} \sum_{i=1}^{n} [y_i \log(\hat{y}_i) + (1-y_i) \log(1-\hat{y}_i)] $$
19 | - **Performance Metrics:** Accuracy, Precision, Recall, F1 Score
20 | - **Caveats:** Assumes linearity in log odds; not suitable for multi-class without modification.
21 |
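Likewise, the sigmoid and the log loss can be written directly from the formulas above (toy values):

```python
import numpy as np

z = np.array([2.0, -1.0, 0.5])           # linear scores beta_0 + beta . x
p = 1 / (1 + np.exp(-z))                 # P(Y=1|X), the sigmoid
y = np.array([1, 0, 1])                  # true labels

log_loss = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
print(p, log_loss)
```
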
22 | ### 3. **Decision Trees**
23 | - **Mathematical Formula:**
24 | - For classification using Gini Impurity:
25 | $$ Gini(D) = 1 - \sum_{j=1}^{C} p_j^2 $$
26 | where $$p_j$$ is the proportion of class $$j$$ in dataset $$D$$.
27 | - For regression:
28 | $$ MSE(D) = \frac{1}{|D|} \sum_{i=1}^{|D|} (y_i - \bar{y})^2 $$
29 | where $$y_i$$ are the actual values and $$\bar{y}$$ is the mean of $$y$$.
30 | - **Loss Function:** Gini Impurity or Mean Squared Error
31 | - **Performance Metrics:** Accuracy, MAE
32 | - **Caveats:** Prone to overfitting; sensitive to data changes.
33 |
34 | ### 4. **Support Vector Machines (SVM)**
35 | - **Mathematical Formula:**
36 | $$ f(x) = w^T x + b $$
37 | where $$w$$ is the weight vector and $$b$$ is the bias.
38 | - **Loss Function:** Hinge Loss
39 | $$ L(y, f(x)) = \max(0, 1 - y f(x)) $$
40 | - **Performance Metrics:** Accuracy, Precision, Recall
41 | - **Caveats:** Computationally expensive for large datasets; requires careful tuning of hyperparameters.
42 |
43 | ### 5. **Random Forest**
44 | - **Mathematical Formula:**
45 | The prediction is made by averaging the predictions from multiple decision trees:
46 | $$ \hat{y} = \frac{1}{N} \sum_{i=1}^{N} T_i(x) $$
47 | where $$T_i$$ are individual trees.
48 | - **Loss Function:** Mean Squared Error or Gini Impurity
49 | - **Performance Metrics:** Out-of-Bag Error, Accuracy
50 | - **Caveats:** Less interpretable than single trees; requires significant computational resources.
51 |
52 | ### 6. **Gradient Boosting Machines (GBM)**
53 | - **Mathematical Formula:**
54 | $$ F(x) = F_{m-1}(x) + \gamma_m h_m(x) $$
55 | where $$h_m(x)$$ is the new tree added at iteration $$m$$.
56 | - **Loss Function:** Log Loss or Mean Squared Error
57 | - **Performance Metrics:** RMSE
58 | - **Caveats:** Sensitive to overfitting; requires careful tuning of learning rate and tree depth.
59 |
60 | ### 7. **Neural Networks**
61 | - **Mathematical Formula:**
62 | $$ y = f(WX + b) $$
63 | where $$W$$ are weights, $$X$$ is input data, and $$b$$ is bias.
64 | - **Loss Function:** Cross-Entropy Loss or Mean Squared Error
65 | - Cross-Entropy for classification:
66 | $$ L = -\frac{1}{n} \sum_{i=1}^{n} [y_i \log(\hat{y}_i)] $$
67 | - **Performance Metrics:** Accuracy, F1 Score, AUC
68 | - **Caveats:** Requires large amounts of data; less interpretable than traditional models.
69 |
70 | ### 8. **K-Means Clustering**
71 | - **Mathematical Formula:**
72 | $$ J = \sum_{i=1}^{k} \sum_{j=1}^{n} ||x_j^{(i)} - c_i||^2 $$
73 | where $$c_i$$ are centroids and $$x_j^{(i)}$$ are data points assigned to cluster $$i$$.
74 | - **Loss Function:** Sum of Squared Errors (SSE)
75 | - **Performance Metrics:** Silhouette Score, Davies-Bouldin Index
76 | - **Caveats:** Assumes spherical clusters; sensitive to initial centroid placement.
77 |
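The objective $$ J $$ can be evaluated directly; a minimal sketch with toy 2-D points and fixed centroids (a full implementation would alternate assignment and centroid-update steps):

```python
import numpy as np

points = np.array([[1.0, 1.0], [1.2, 0.8], [5.0, 5.0], [5.2, 4.9]])
centroids = np.array([[1.1, 0.9], [5.1, 4.95]])

# Assign each point to its nearest centroid, then sum squared distances
dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
assign = dists.argmin(axis=1)
J = np.sum((points - centroids[assign]) ** 2)
print(assign, J)
```
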
78 | ## Summary of Formulations
79 |
80 | | Algorithm | Mathematical Formula | Loss Function | Performance Metrics |
81 | |------------------------|-------------------------------------------------------------------------------------|------------------------------------|----------------------------------------|
82 | | Linear Regression | $$ y = \beta_0 + \beta_1 x_1 + ... + \beta_n x_n + \epsilon $$ | MSE | R², MAE |
83 | | Logistic Regression | $$ P(Y=1 \mid X) = \frac{1}{1 + e^{-(\beta_0 + ... + \beta_n x_n)}} $$ | Binary Cross-Entropy | Accuracy, F1 Score |
84 | | Decision Trees | Gini: $$ Gini(D) = 1 - \sum p_j^2 $$ | Gini Impurity / MSE | Accuracy, MAE |
85 | | Support Vector Machines | $$ f(x) = w^T x + b $$ | Hinge Loss | Accuracy, Precision |
86 | | Random Forest | $$ \hat{y} = \frac{1}{N} \sum T_i(x) $$ | MSE / Gini Impurity | Out-of-Bag Error |
87 | | Gradient Boosting | $$ F(x) = F_{m-1}(x) + \gamma_m h_m(x) $$ | Log Loss / MSE | RMSE |
88 | | Neural Networks | $$ y = f(WX + b) $$ | Cross-Entropy / MSE | Accuracy, AUC |
89 | | K-Means Clustering | $$ J = \sum \lVert x_j^{(i)} - c_i \rVert^2 $$ | SSE | Silhouette Score |
90 |
91 | This comprehensive overview provides insights into each algorithm's mathematical foundation along with its practical applications and limitations. Understanding these aspects can help in selecting the right algorithm for specific data science tasks.
92 |
102 |
--------------------------------------------------------------------------------
/DataScience/study_plan.md:
--------------------------------------------------------------------------------
1 | # Statistics
2 | * T test
3 | * Z test
4 | * ANOVA
5 | * Chi Square
6 | * Correlation
7 | * Covariance
8 | * Hypothesis Testing
9 |
10 | # Classic ML
11 | * Linear Regression
12 | * Logistic Regression
13 | * Regularization (Ridge and Lasso)
14 | * Cost Functions
15 | * Decision Tree
16 | * Random Forest
17 | * Ensemble Learning
18 | * Bagging and Boosting
19 | * XGBoost
20 | * LightGBM
21 |
22 | # Hyperparameter Tuning
23 | * Grid Search
24 | * Random Search
25 | * HyperOpt
26 | * Feature Selection - PCA
27 |
28 | # Normalization
29 | * Imbalanced Dataset
30 | * Imputing Missing data
31 | * Handling Outliers
32 | * Cross Validation
33 |
34 | # Clustering
35 | * K-Means clustering
36 | * KNN
37 | * Principal Component Analysis
38 |
39 | # Performance Measures
40 | * R-square
41 | * Adjusted R-square
42 | * Mean Square Error
43 | * Root Mean Square Error
44 | * MAPE
45 | * Mean Absolute Error
46 |
47 | * Recall
48 | * Precision
49 | * Accuracy
50 | * F1-Score
51 | * ROC-AUC
52 | * Confusion Matrix
53 |
54 | * Type1 Error
55 | * Type2 Error
56 | * True Positive Rate
57 | * False Positive Rate
58 |
59 |
60 | # Advanced ML
61 | * CNN
62 | * RCNN
63 | * LSTM
64 | * Transformers
65 | * BERT
66 |
67 |
68 | # Time Series
69 | * Trend
70 | * Seasonality
71 | * Irregularity
72 | * Cyclicity
73 | * Stationarity
74 | * ADF
75 | * Making data stationary
76 | * White Noise
77 | * Holt Winters
78 | * FB-Prophet
79 |
80 |
81 | # Drift Detection
82 | * Type of drifts
83 | * KS Test
84 | * KL Divergence
85 | * Wasserstein distance
86 | * ADWIN
87 |
88 | # NLP
89 | * Stemming
90 | * Lemmatization
91 | * TF-IDF
92 | * Word2Vec
93 | * Bag of Words models
94 | * Spacy
95 |
96 | # MLOPS
97 | * MLFlow
98 | * Model Registry
99 | * Data Versioning
100 | * Artifacts
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/GenerativeAI/1.1.What does generative truly mean.md:
--------------------------------------------------------------------------------
1 | In the context of deep learning, **generative** refers to models that are capable of generating new data samples that are similar to the training data they were trained on. These models learn the underlying probability distribution of the training data and use it to create novel samples[1][2].
2 |
3 | The key principles behind generative deep learning models are:
4 |
5 | ## Learning the Data Distribution
6 |
7 | Generative models learn the probability distribution of the training data. This allows them to generate new samples that are statistically similar to the original data[2].
8 |
9 | ## Sampling from the Learned Distribution
10 |
11 | Once the model has learned the data distribution, it can sample from this distribution to generate new samples. This sampling process introduces randomness, which allows the model to produce varied outputs for the same input[1].
12 |
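As a toy illustration of "learn the distribution, then sample from it", here is a minimal NumPy sketch that fits a Gaussian to synthetic training data and draws novel samples; real generative models learn far richer distributions:

```python
import numpy as np

rng = np.random.default_rng(0)
train = rng.normal(loc=5.0, scale=2.0, size=1000)   # stand-in "training data"

# Learn the distribution: here, just its mean and standard deviation
mu, sigma = train.mean(), train.std()

# Sample from the learned distribution to generate novel data points
new_samples = rng.normal(mu, sigma, size=5)
print(new_samples)
```
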
13 | ## Adversarial Training (GANs)
14 |
15 | One popular type of generative model is the Generative Adversarial Network (GAN). GANs consist of two neural networks - a generator and a discriminator. The generator generates new samples, while the discriminator tries to distinguish between real and generated samples. Through this adversarial training process, the generator learns to produce more realistic samples that can fool the discriminator[2].
16 |
17 | ## Variational Autoencoders (VAEs)
18 |
19 | Another important class of generative models are Variational Autoencoders (VAEs). VAEs learn a latent representation of the data and use this representation to generate new samples. They are trained to maximize the likelihood of the training data under the learned generative model[3].
20 |
21 | In summary, generative deep learning models learn the underlying probability distribution of the training data and use this knowledge to generate novel samples that are statistically similar to the original data. This allows them to create impressive outputs like realistic images, coherent text, and plausible audio[3][4][5].
22 |
23 | Citations:
24 | [1] https://www.cmu.edu/intelligentbusiness/expertise/genai-principles.pdf
25 | [2] https://www.sixsigmacertificationcourse.com/the-basic-principles-of-generative-models-with-an-example/
26 | [3] https://www.shroffpublishers.com/books/9789355429988/
27 | [4] https://www.amazon.in/Generative-Deep-Learning-David-Foster-ebook/dp/B0C3WVJWBF
28 | [5] https://www.amazon.in/Deep-Learning-Scratch-Building-Principles/dp/935213902X
29 |
--------------------------------------------------------------------------------
/GenerativeAI/1.2 Next Word Prediction.md:
--------------------------------------------------------------------------------
1 | Next word prediction is a fundamental task in Natural Language Processing (NLP) that involves predicting the most likely word to follow a given sequence of words. This task has evolved significantly with the advent of deep learning models, particularly the Transformer architecture, which has transformed the landscape of NLP.
2 |
3 | ## Evolution of Next Word Prediction Models
4 |
5 | ### Early Models: RNNs, LSTMs, and GRUs
6 |
7 | Before the introduction of Transformers, next word prediction was primarily handled by Recurrent Neural Networks (RNNs) and their variants, such as Long Short-Term Memory (LSTM) networks and Gated Recurrent Units (GRU).
8 |
9 | - **RNNs** maintain hidden states that capture information from previous inputs, allowing them to process sequences of data. However, they often struggle with long-range dependencies due to issues like the vanishing gradient problem.
10 |
11 | - **LSTMs** were designed to overcome these limitations by introducing memory cells that can store and retrieve information over longer sequences, making them effective for capturing long-term dependencies.
12 |
13 | - **GRUs** simplify the LSTM architecture by merging the cell state and hidden state, providing a more computationally efficient alternative while still managing to capture long-range dependencies effectively[1].
14 |
15 | These models laid the groundwork for understanding sequential data and context in language, but they were limited by their sequential processing nature, which hindered parallelization and scalability.
16 |
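Before turning to Transformers, it is worth seeing how simple next word prediction can be made; a toy bigram-count predictor in Python (the corpus is invented):

```python
from collections import Counter, defaultdict

corpus = "the cat sat on the mat the cat ate the fish".split()

# Count which word follows which (a bigram model)
following = defaultdict(Counter)
for prev, nxt in zip(corpus, corpus[1:]):
    following[prev][nxt] += 1

def predict_next(word):
    """Return the word most often seen after `word`."""
    return following[word].most_common(1)[0][0]

print(predict_next("the"))  # -> 'cat' (seen twice after 'the')
```
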
17 | ## The Transformer Architecture
18 |
19 | Introduced in the groundbreaking paper "Attention Is All You Need" by Vaswani et al. in 2017, the Transformer model revolutionized next word prediction by eliminating the recurrence mechanism entirely. Instead, it relies on a self-attention mechanism that allows it to process all words in a sequence simultaneously, capturing relationships between words regardless of their distance from each other in the text.
20 |
21 | ### Key Components of Transformers
22 |
23 | 1. **Self-Attention Mechanism**: This mechanism allows the model to weigh the importance of different words in the input sequence when making predictions. Each word can attend to all other words, enabling the model to capture complex dependencies and contextual relationships effectively.
24 |
25 | 2. **Positional Encoding**: Since Transformers do not process sequences in order, they use positional encodings to retain information about the position of words within the sequence. This helps the model understand the order of words, which is crucial for language comprehension.
26 |
27 | 3. **Encoder-Decoder Structure**: The Transformer consists of an encoder that processes the input sequence and a decoder that generates the output sequence. Each encoder and decoder layer employs self-attention and feed-forward networks, allowing for efficient learning of language patterns[2][3].
28 |
29 | ### Advantages of Transformers
30 |
31 | Transformers offer several advantages over previous models:
32 |
33 | - **Parallelization**: Unlike RNNs, which process inputs sequentially, Transformers can process entire sequences simultaneously, significantly speeding up training.
34 |
35 | - **Long-Range Dependencies**: The self-attention mechanism enables better handling of long-range dependencies, allowing the model to consider the entire context when predicting the next word.
36 |
37 | - **Scalability**: Transformers can be scaled up easily, leading to the development of large language models (LLMs) like GPT-3 and BERT, which have demonstrated remarkable performance across various NLP tasks, including next word prediction[4][5].
38 |
39 | ## Conclusion
40 |
41 | The transition from RNNs and their variants to the Transformer architecture marks a significant advancement in next word prediction capabilities. Transformers have not only improved the efficiency and accuracy of predictions but have also paved the way for the development of sophisticated language models that can understand and generate human-like text. This evolution underscores the importance of architectural innovations in enhancing the performance of NLP applications.
42 |
43 | Citations:
44 | [1] https://www.geeksforgeeks.org/next-word-prediction-with-deep-learning-in-nlp/
45 | [2] https://datasciencedojo.com/blog/transformer-models/
46 | [3] https://en.wikipedia.org/wiki/Transformer_%28machine_learning_model%29
47 | [4] https://www.leewayhertz.com/decision-transformer/
48 | [5] https://towardsdatascience.com/transformers-141e32e69591
49 | [6] https://www.datacamp.com/tutorial/how-transformers-work
50 | [7] https://www.geeksforgeeks.org/getting-started-with-transformers/
51 | [8] https://www.techscience.com/cmc/v78n3/55891/html
52 |
--------------------------------------------------------------------------------
/GenerativeAI/1.3 Embedding Process. - Mathematical Intution.md:
--------------------------------------------------------------------------------
1 | ## The Intuition Behind Embeddings in Transformers
2 |
3 | Embeddings are a fundamental component of Transformer models, allowing them to represent words and tokens as numerical vectors that can be processed by neural networks. The embedding process maps discrete tokens (like words) into a continuous vector space, where similar tokens are positioned close together. This embedding space captures semantic and syntactic relationships between tokens.
4 |
5 | Some key characteristics of embeddings that make them useful for Transformers:
6 |
7 | ### Continuous Representation
8 | Embeddings represent tokens as continuous vectors, rather than discrete indices. This allows the model to learn smooth relationships between tokens, enabling better generalization.
9 |
10 | ### Dimensionality Reduction
11 | High-dimensional one-hot encoded token representations are mapped to a much lower dimensional embedding space (e.g. 300 dimensions). This dimensionality reduction allows the model to efficiently process and store token representations.
12 |
13 | ### Semantic Relationships
14 | The embedding space encodes semantic relationships between tokens. For example, the vector for "king" - "man" + "woman" points to the vector for "queen"[1]. These relationships emerge from the training data.
15 |
16 | ### Parallelization
17 | Embeddings allow the model to process all tokens in parallel, rather than sequentially. This is important for the self-attention mechanism in Transformers, which computes relationships between all pairs of tokens[3].
18 |
19 | ### Transfer Learning
20 | Pre-trained embeddings, like those from BERT, can be fine-tuned on specific tasks. The embeddings capture general language knowledge that can be leveraged for various applications[5].
21 |
22 | ### Intuitive Visualization
23 | Embeddings can be visualized in 2D or 3D space to gain intuitions about the model's internal representations. Semantically similar tokens cluster together in the embedding space[4].
24 |
25 | Mathematically, an embedding space is a manifold in which similar items are positioned closer to one another than dissimilar items[6]. The embedding process maps discrete tokens to points on this manifold, preserving semantic relationships. Transformers leverage these properties of embeddings to efficiently process and reason about language.
26 |
27 | Citations:
28 | [1] https://towardsdatascience.com/analyzing-transformers-in-embedding-space-explained-ef72130a6844?gi=ecd132be68ed
29 | [2] https://news.ycombinator.com/item?id=40497379
30 | [3] https://towardsdatascience.com/transformers-intuitively-and-exhaustively-explained-58a5c5df8dbb
31 | [4] https://encord.com/blog/embeddings-machine-learning/
32 | [5] https://www.datacamp.com/tutorial/how-transformers-work
33 | [6] https://stackoverflow.blog/2023/11/09/an-intuitive-introduction-to-text-embeddings/
34 | [7] https://www.ibm.com/think/topics/vector-embedding
35 | [8] https://www.geeksforgeeks.org/word-embeddings-in-nlp/
36 |
--------------------------------------------------------------------------------
/GenerativeAI/1.4 Attention Block - Python Example.md:
--------------------------------------------------------------------------------
1 | The attention mechanism in Transformers is a powerful mathematical framework that enables models to focus on different parts of the input sequence, allowing for better understanding of context and relationships within the data. This is particularly useful in tasks such as natural language processing and image recognition.
2 |
3 | ## Mathematical Intuition of Attention Block
4 |
5 | ### Key Concepts
6 |
7 | 1. **Queries, Keys, and Values**: In the context of attention, each input is transformed into three vectors:
8 | - **Query (Q)**: Represents the item for which we want to find relevant information.
9 | - **Key (K)**: Represents the items in the input that can provide information.
10 | - **Value (V)**: Represents the actual information associated with each key.
11 |
12 | 2. **Scaled Dot-Product Attention**: The attention score between queries and keys is computed using the dot product, scaled by the square root of the dimension of the key vectors, followed by a softmax operation to obtain attention weights. The output is then a weighted sum of the value vectors.
13 |
14 | The formula for the attention mechanism can be summarized as:
15 |
16 | $$
17 | \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
18 | $$
19 |
20 | where $$d_k$$ is the dimension of the key vectors.
21 |
22 | 3. **Multi-Head Attention**: Instead of performing a single attention function, multiple attention heads are used. Each head learns different representations by applying the attention mechanism independently and then concatenating their outputs.
23 |
24 | ### End-to-End Process Example
25 |
26 | To illustrate the attention mechanism, we can implement a simple version using Python and NumPy. Below is a step-by-step example.
27 |
28 | ```python
29 | import numpy as np
30 |
31 | # Define input dimensions
32 | d_model = 4 # Dimension of the model
33 | d_k = 2 # Dimension of keys and queries
34 | d_v = 2 # Dimension of values
35 | num_heads = 2 # Number of attention heads
36 |
37 | # Sample input data (3 tokens in the sequence, each represented by a vector of size d_model)
38 | X = np.array([[1, 0, 1, 0],
39 | [0, 1, 0, 1],
40 | [1, 1, 1, 1]])
41 |
42 | # Randomly initialize weight matrices for queries, keys, and values
43 | W_Q = np.random.rand(d_model, d_k)
44 | W_K = np.random.rand(d_model, d_k)
45 | W_V = np.random.rand(d_model, d_v)
46 |
47 | # Compute queries, keys, and values. @ is the Matrix Multiplication Op.
48 | Q = X @ W_Q
49 | K = X @ W_K
50 | V = X @ W_V
51 |
52 | # Compute attention scores
53 | scores = Q @ K.T / np.sqrt(d_k) # Scale scores
54 | attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True) # Softmax
55 |
56 | # Compute output
57 | output = attention_weights @ V
58 |
59 | print("Queries:\n", Q)
60 | print("Keys:\n", K)
61 | print("Values:\n", V)
62 | print("Attention Weights:\n", attention_weights)
63 | print("Output:\n", output)
64 | ```
65 |
66 | ### Explanation of the Code
67 |
68 | 1. **Input Data**: We define a simple input matrix `X` representing three tokens, each with a feature vector of size `d_model`.
69 |
70 | 2. **Weight Matrices**: Random weight matrices `W_Q`, `W_K`, and `W_V` are initialized for transforming the input into queries, keys, and values.
71 |
72 | 3. **Computing Q, K, V**: The input matrix is multiplied by the corresponding weight matrices to obtain the queries, keys, and values.
73 |
74 | 4. **Attention Scores**: The dot product of queries and keys is computed, scaled, and passed through a softmax function to obtain attention weights.
75 |
76 | 5. **Output Calculation**: The final output is computed as a weighted sum of the values based on the attention weights.
77 |
78 | This example demonstrates the core functionality of the attention mechanism, capturing the relationships between different tokens in the input sequence. The multi-head attention can be implemented similarly by repeating the process for multiple sets of weight matrices and concatenating the results.
79 |
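To extend the example above to multi-head attention, the same computation is repeated once per head with separate weight matrices, and the head outputs are concatenated; a minimal sketch under the same toy setup:

```python
import numpy as np

X = np.array([[1, 0, 1, 0],
              [0, 1, 0, 1],
              [1, 1, 1, 1]])        # same 3-token input as above
d_model, d_k, d_v, num_heads = 4, 2, 2, 2

head_outputs = []
for _ in range(num_heads):
    # Each head has its own projection matrices
    W_Q = np.random.rand(d_model, d_k)
    W_K = np.random.rand(d_model, d_k)
    W_V = np.random.rand(d_model, d_v)
    Q, K, V = X @ W_Q, X @ W_K, X @ W_V
    scores = Q @ K.T / np.sqrt(d_k)
    weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)
    head_outputs.append(weights @ V)

# Concatenate head outputs; a full Transformer would then apply a
# learned output projection W_O of shape (num_heads * d_v, d_model)
multi_head = np.concatenate(head_outputs, axis=1)
print("Multi-head output shape:", multi_head.shape)  # (3, num_heads * d_v)
```
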
89 |
--------------------------------------------------------------------------------
/GenerativeAI/1.5 MLP Block - Python Example.md:
--------------------------------------------------------------------------------
1 | ## Multi-Layer Perceptron (MLP) in Transformers
2 |
3 | The Multi-Layer Perceptron (MLP) is a key component of the Transformer architecture, responsible for refining the representation of each token using a non-linear transformation. Here's the mathematical intuition behind the MLP in Transformers:
4 |
5 | ### Mathematical Formulation
6 |
7 | The MLP in Transformers operates across the features of each token, applying the same non-linear transformation to each token independently. Given the output of the self-attention layer `y(m)_n` for token `n` at layer `m`, the MLP computes:
8 |
9 | $$
10 | x^{(m+1)}_n = \text{MLP}_\theta(y^{(m)}_n)
11 | $$
12 |
13 | where $$\theta$$ represents the parameters of the MLP, which are shared across all tokens.
14 |
15 | The MLP typically consists of one or two hidden layers with a dimension equal to the number of features `D` (or larger). The computational cost of this step is roughly `N * D * D`, where `N` is the sequence length.
16 |
17 | ### Example Implementation in Python and NumPy
18 |
19 | Here's a simple example of implementing the MLP component in Transformers using Python and NumPy:
20 |
21 | ```python
22 | import numpy as np
23 |
24 | # Define MLP parameters
25 | D = 4 # Number of features
26 | hidden_size = 8 # Size of the hidden layer
27 |
28 | # Sample input from the self-attention layer
29 | y = np.array([[1, 0, 1, 0],
30 | [0, 1, 0, 1],
31 | [1, 1, 1, 1]])
32 |
33 | # Initialize MLP weights
34 | W1 = np.random.rand(D, hidden_size)
35 | b1 = np.random.rand(1, hidden_size)
36 | W2 = np.random.rand(hidden_size, D)
37 | b2 = np.random.rand(1, D)
38 |
39 | # Compute MLP output
40 | h = np.maximum(0, y @ W1 + b1) # ReLU activation in the hidden layer
41 | x = h @ W2 + b2 # Linear output layer
42 |
43 | print("Input from self-attention layer:\n", y)
44 | print("Output of the MLP:\n", x)
45 | ```
46 |
47 | In this example:
48 |
49 | 1. We define the MLP parameters, including the number of features `D` and the size of the hidden layer.
50 |
51 | 2. We create a sample input `y` from the self-attention layer.
52 |
53 | 3. We initialize the weights and biases of the MLP randomly.
54 |
55 | 4. We compute the output of the MLP by applying the following steps:
56 | - Compute the hidden layer activation using a ReLU non-linearity.
57 | - Apply the output layer weights and biases to obtain the final output.
58 |
59 | 5. Finally, we print the input from the self-attention layer and the output of the MLP.
60 |
61 | The MLP in Transformers acts as a non-linear feature extractor, processing the output of the self-attention layer independently for each token. It helps capture complex interactions between features and refine the representations learned by the self-attention mechanism.
62 |
72 |
--------------------------------------------------------------------------------
/GenerativeAI/1.6 Positional Encoding - Python Example.md:
--------------------------------------------------------------------------------
1 | ## Positional Encoding in Transformers
2 |
3 | Positional encoding is a critical component of the Transformer architecture, designed to provide information about the position of tokens in a sequence. Unlike recurrent neural networks (RNNs), which inherently process sequences in order, Transformers process all tokens in parallel. This parallel processing means that Transformers lack an inherent understanding of the order of tokens, making positional encodings essential.
4 |
5 | ### Mathematical Intuition
6 |
7 | The primary goal of positional encoding is to inject information about the position of each token in the input sequence. The positional encoding for a token at position $$ p $$ in a sequence is defined using sine and cosine functions of varying frequencies, as follows:
8 |
9 | - For even indices:
10 | $$
11 | PE(p, 2i) = \sin\left(\frac{p}{10000^{2i/d_{\text{model}}}}\right)
12 | $$
13 |
14 | - For odd indices:
15 | $$
16 | PE(p, 2i+1) = \cos\left(\frac{p}{10000^{2i/d_{\text{model}}}}\right)
17 | $$
18 |
19 | Where:
20 | - $$ p $$ is the position of the token in the sequence.
21 | - $$ i $$ is the dimension index.
22 | - $$ d_{\text{model}} $$ is the total number of dimensions in the embedding.
23 |
24 | This formulation allows each position to have a unique encoding, and the use of sine and cosine functions ensures that the positional encodings can capture relative positions. The geometric progression of frequencies allows the model to learn to attend to relative positions effectively.
25 |
26 | ### End-to-End Process Example
27 |
28 | To illustrate how positional encoding works in practice, we can implement it using Python and NumPy. Below is a step-by-step example.
29 |
30 | ```python
31 | import numpy as np
32 |
33 | def positional_encoding(max_len, d_model):
34 | # Initialize the positional encoding matrix
35 | pos_enc = np.zeros((max_len, d_model))
36 |
37 | # Compute positional encodings
38 | for p in range(max_len):
39 | for i in range(0, d_model, 2):
40 |             pos_enc[p, i] = np.sin(p / (10000 ** (i / d_model)))  # i is already the even dimension index (the 2i of the formula)
41 |             if i + 1 < d_model:
42 |                 pos_enc[p, i + 1] = np.cos(p / (10000 ** (i / d_model)))
43 |
44 | return pos_enc
45 |
46 | # Example parameters
47 | max_len = 10 # Maximum length of the input sequence
48 | d_model = 4 # Dimension of the embedding
49 |
50 | # Compute positional encodings
51 | pos_encodings = positional_encoding(max_len, d_model)
52 |
53 | print("Positional Encodings:\n", pos_encodings)
54 | ```
55 |
56 | ### Explanation of the Code
57 |
58 | 1. **Function Definition**: The `positional_encoding` function takes two parameters: `max_len` (the maximum length of the input sequence) and `d_model` (the dimensionality of the embedding).
59 |
60 | 2. **Matrix Initialization**: A zero matrix `pos_enc` is initialized to store the positional encodings.
61 |
62 | 3. **Computing Encodings**: Two nested loops iterate over each position $$ p $$ and dimension $$ i $$:
63 | - For even indices, the sine function is applied.
64 | - For odd indices, the cosine function is applied.
65 |
66 | 4. **Output**: The resulting positional encodings matrix is printed, showing the positional information for each position in the sequence.
67 |
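For larger `max_len` and `d_model`, the same matrix can be built without Python loops. Here is a hedged vectorized sketch that is equivalent to the loop version above when `d_model` is even:

```python
import numpy as np

def positional_encoding_vectorized(max_len, d_model):
    positions = np.arange(max_len)[:, np.newaxis]             # shape (max_len, 1)
    div_term = 10000 ** (np.arange(0, d_model, 2) / d_model)  # one frequency per even index
    pos_enc = np.zeros((max_len, d_model))
    pos_enc[:, 0::2] = np.sin(positions / div_term)  # even dimensions
    pos_enc[:, 1::2] = np.cos(positions / div_term)  # odd dimensions
    return pos_enc

print(positional_encoding_vectorized(10, 4))
```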
68 | ### Summary
69 |
70 | Positional encoding is essential in the Transformer architecture, allowing the model to incorporate information about the order of tokens in a sequence. By using sine and cosine functions, positional encodings provide unique representations for each position, enabling the model to learn relationships between tokens effectively. This approach enhances the model's ability to process sequences without losing the critical information of token order.
71 |
72 | Citations:
73 | [1] https://www.geeksforgeeks.org/positional-encoding-in-transformers/
74 | [2] https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/
75 | [3] https://kazemnejad.com/blog/transformer_architecture_positional_encoding/
76 | [4] https://www.youtube.com/watch?v=kO0XdAsY5YA
77 | [5] https://nlp.seas.harvard.edu/2018/04/03/attention.html
78 | [6] https://www.linkedin.com/pulse/deep-dive-positional-encodings-transformer-neural-network-ajay-taneja
79 | [7] https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html
80 | [8] https://www.youtube.com/watch?v=ZMxVe-HK174
81 |
--------------------------------------------------------------------------------
/GenerativeAI/1.7 End to End process of Attention.md:
--------------------------------------------------------------------------------
1 | To illustrate the functioning of the attention block in Transformers, let's break down the process using a sample sentence, its embedding vector, and the attention mechanism step-by-step. We will also show how to compute the probability distribution of the next best word based on the attention scores.
2 |
3 | ### Example Sentence
4 |
5 | Let's take the sentence: **"Life is short"**.
6 |
7 | ### Step 1: Word Embedding
8 |
9 | First, we need to convert the words into embedding vectors. For simplicity, we'll use random embeddings for each word.
10 |
11 | ```python
12 | import numpy as np
13 |
14 | # Define the sentence and create a dictionary for word indices
15 | sentence = "Life is short"
16 | words = sentence.split()
17 | word_to_index = {word: i for i, word in enumerate(words)}
18 |
19 | # Create random embeddings for each word
20 | embedding_dim = 4 # Dimension of the embedding
21 | embeddings = np.random.rand(len(words), embedding_dim)
22 |
23 | print("Word Indices:", word_to_index)
24 | print("Word Embeddings:\n", embeddings)
25 | ```
26 |
27 | ### Step 2: Compute Queries, Keys, and Values
28 |
29 | In the attention mechanism, we need to compute the queries (Q), keys (K), and values (V) from the embeddings. We will use learned weight matrices for this purpose.
30 |
31 | ```python
32 | # Initialize weight matrices for Q, K, and V
33 | W_Q = np.random.rand(embedding_dim, embedding_dim)
34 | W_K = np.random.rand(embedding_dim, embedding_dim)
35 | W_V = np.random.rand(embedding_dim, embedding_dim)
36 |
37 | # Compute Q, K, V
38 | Q = embeddings @ W_Q
39 | K = embeddings @ W_K
40 | V = embeddings @ W_V
41 |
42 | print("Queries (Q):\n", Q)
43 | print("Keys (K):\n", K)
44 | print("Values (V):\n", V)
45 | ```
46 |
47 | ### Step 3: Compute Attention Scores
48 |
49 | Next, we calculate the attention scores using the dot product of the queries and keys, followed by a softmax to obtain the attention weights.
50 |
51 | ```python
52 | # Compute attention scores
53 | scores = Q @ K.T / np.sqrt(embedding_dim) # Scale by the square root of the dimension
54 | attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True) # Softmax
55 |
56 | print("Attention Scores:\n", scores)
57 | print("Attention Weights:\n", attention_weights)
58 | ```
59 |
60 | ### Step 4: Compute Output of the Attention Block
61 |
62 | The output of the attention block is computed as a weighted sum of the values, using the attention weights.
63 |
64 | ```python
65 | # Compute the output of the attention block
66 | output = attention_weights @ V
67 |
68 | print("Output of Attention Block:\n", output)
69 | ```
70 |
71 | ### Step 5: Probability Distribution for Next Word
72 |
73 | To predict the next word, we can apply a simple linear layer followed by a softmax function to the output of the attention block. This simulates how we would generate probabilities for the next word in a sequence.
74 |
75 | ```python
76 | # Initialize weights for the output layer
77 | W_out = np.random.rand(embedding_dim, len(words))
78 |
79 | # Compute logits
80 | logits = output @ W_out
81 |
82 | # Compute probabilities using softmax
83 | probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
84 |
85 | print("Logits:\n", logits)
86 | print("Probability Distribution for Next Word:\n", probabilities)
87 | ```
88 |
89 | ### Summary of the Process
90 |
91 | 1. **Word Embedding**: Convert words into embedding vectors.
92 | 2. **Compute Q, K, V**: Use learned weight matrices to compute queries, keys, and values from the embeddings.
93 | 3. **Attention Scores**: Calculate scores using the dot product of queries and keys, then apply softmax to obtain attention weights.
94 | 4. **Output of Attention Block**: Compute the output as a weighted sum of the values based on the attention weights.
95 | 5. **Next Word Probability**: Generate a probability distribution for the next word using a linear transformation followed by softmax.
96 |
97 | ### Final Output
98 |
99 | The final output will show the probability distribution of the next best word based on the attention mechanism applied to the input sentence. This allows the model to capture the context and relationships between the words effectively.
100 |
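To turn the distribution into a concrete prediction, one simple (greedy) choice, continuing the example above, is to take the argmax of the last position's probabilities. Note that in this toy setup the "vocabulary" is just the three words of the input sentence, since `W_out` maps to `len(words)`.

```python
# Greedy decoding: pick the highest-probability word for the last token position
index_to_word = {i: w for w, i in word_to_index.items()}
next_word_index = int(np.argmax(probabilities[-1]))
print("Predicted next word:", index_to_word[next_word_index])
```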
101 | Citations:
102 | [1] https://nlp.gluon.ai/examples/sentence_embedding/self_attentive_sentence_embedding.html
103 | [2] https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html
104 | [3] https://datascience.stackexchange.com/questions/95134/how-to-encode-a-sentence-using-an-attention-mechanism
105 | [4] https://towardsdatascience.com/contextual-transformer-embeddings-using-self-attention-explained-with-diagrams-and-python-code-d7a9f0f4d94e?gi=0dee21177e82
106 | [5] https://github.com/gazelle93/Transformer-Various-Positional-Encoding
107 | [6] https://www.linkedin.com/pulse/deep-dive-positional-encodings-transformer-neural-network-ajay-taneja
108 | [7] https://towardsdatascience.com/all-you-need-to-know-about-attention-and-transformers-in-depth-understanding-part-1-552f0b41d021?gi=4b6a109307fe
109 | [8] https://stackoverflow.blog/2023/11/09/an-intuitive-introduction-to-text-embeddings/
110 |
--------------------------------------------------------------------------------
/GenerativeAI/2. How does FAISS work.md:
--------------------------------------------------------------------------------
1 | Here is an explanation of how Facebook AI Similarity Search (FAISS) works, from beginner to advanced, using the example query vector "I like to play football":
2 |
3 | ## Beginner Explanation
4 |
5 | FAISS is a library that allows you to quickly find similar items in a large dataset of vectors. For example, if you have a sentence embedding vector for the query "I like to play football", FAISS can efficiently search through millions or billions of other sentence embedding vectors to find the ones that are most similar.
6 |
7 | To use FAISS, you first need to create an index from your dataset of vectors. This involves some preprocessing to optimize the index for fast similarity search. Then, when you have a query vector like "I like to play football", you can pass it to FAISS to search the index and get back the most similar vectors, ranked by similarity score.
8 |
9 | FAISS uses techniques like quantization and efficient distance computation to make the search much faster than a brute force approach of comparing the query to every vector in the dataset one by one.
10 |
11 | ## Intermediate Explanation
12 |
13 | Let's say you have a dataset of 1 billion sentence embedding vectors, and you want to find the 10 most similar vectors to "I like to play football". Here's how FAISS would work:
14 |
15 | 1. **Preprocessing**: FAISS builds an index data structure from the 1 billion vectors. This involves partitioning the vectors into clusters and encoding them using product quantization to reduce memory usage[1][2].
16 |
17 | 2. **Searching**: When you pass the query vector "I like to play football" to FAISS, it first identifies which clusters the query is closest to. It then only compares the query to the vectors within those clusters, rather than all 1 billion vectors[3].
18 |
19 | 3. **Ranking**: FAISS computes the similarity scores between the query and the vectors in the relevant clusters. It returns the 10 vectors with the highest scores, which are the most similar to the query[4].
20 |
21 | FAISS is highly optimized for this process, using techniques like multi-threading and GPU acceleration to make the search extremely fast, even on a dataset of 1 billion vectors[1][5].
22 |
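To make this concrete, here is a minimal sketch of the cluster-then-search flow using the FAISS Python API, with random vectors standing in for real sentence embeddings (the dimensions and parameter values are illustrative, not taken from the text above):

```python
import numpy as np
import faiss

d = 384                                             # embedding dimension (illustrative)
xb = np.random.rand(100_000, d).astype("float32")   # the indexed "dataset" vectors
xq = np.random.rand(1, d).astype("float32")         # stand-in for the query embedding

nlist = 1024                                        # number of clusters (Voronoi cells)
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)
index.train(xb)                                     # learn the cluster centroids
index.add(xb)                                       # assign vectors to clusters

index.nprobe = 8                                    # clusters to visit per query
D, I = index.search(xq, 10)                         # distances and ids of the top-10 matches
print(I[0])
```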
23 | ## Advanced Explanation
24 |
25 | Under the hood, FAISS uses advanced indexing algorithms to enable efficient similarity search. Some key components are:
26 |
27 | - **Inverted file index (IVF)**: This partitions the vector space into Voronoi cells. For a given query, FAISS first identifies the cells it is closest to, narrowing down the search[3].
28 |
29 | - **Product quantization (PQ)**: Vectors are decomposed into subvectors which are quantized separately. This allows FAISS to store an approximation of the vectors very compactly in RAM[1][2].
30 |
31 | - **Hierarchical navigable small world (HNSW) graph**: An efficient nearest neighbor graph structure that allows fast traversal to find similar vectors[4].
32 |
33 | FAISS provides a range of indexing algorithms that make different time/accuracy/memory tradeoffs. A flat index is the most accurate since it searches exhaustively; IVF combined with PQ trades a small amount of accuracy for large savings in speed and memory, and is what FAISS would likely use for a query like "I like to play football" on a very large dataset[1][2][3].
34 |
35 | By leveraging these advanced indexing techniques, FAISS is able to provide state-of-the-art similarity search performance, enabling applications like efficient semantic search, personalized recommendations, and content-based retrieval[4][5].
36 |
37 | Citations:
38 | [1] https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/
39 | [2] https://ai.meta.com/tools/faiss/
40 | [3] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/
41 | [4] https://www.activeloop.ai/resources/glossary/faiss-facebook-ai-similarity-search/
42 | [5] https://www.linkedin.com/pulse/exploring-power-facebook-ai-similarity-search-library-venkatesh-mungi-6ncof
43 |
--------------------------------------------------------------------------------
/GenerativeAI/3. FAISS Advanced explaination.md:
--------------------------------------------------------------------------------
1 | To expand on the advanced explanation of Facebook AI Similarity Search (FAISS) and incorporate mathematical expressions, we will delve into the underlying mechanisms and algorithms used in FAISS, using the example query vector "I like to play football".
2 |
3 | ## Advanced Explanation of FAISS
4 |
5 | FAISS is designed for efficient similarity search and clustering of dense vectors, typically in high-dimensional spaces. The core idea is to index a large dataset of vectors so that we can quickly retrieve the most similar vectors to a given query vector.
6 |
7 | ### Key Components of FAISS
8 |
9 | 1. **Vector Representation**:
10 | Each sentence or item is represented as a vector in a high-dimensional space. For example, the sentence "I like to play football" might be encoded into a vector $$\mathbf{q}$$ of dimension $$d$$ (e.g., $$d = 768$$ for sentence embeddings).
11 |
12 | 2. **Distance Metrics**:
13 | FAISS supports various distance metrics for measuring similarity between vectors, including:
14 |
15 | - **L2 (Euclidean) Distance**:
16 | $$
17 | D(\mathbf{x}, \mathbf{y}) = \sqrt{\sum_{i=1}^{d} (x_i - y_i)^2}
18 | $$
19 | - **Inner Product** (used for cosine similarity when vectors are normalized):
20 | $$
21 | D(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^{d} x_i \cdot y_i
22 | $$
23 |
24 | 3. **Index Structures**:
25 | FAISS employs several indexing strategies to optimize search performance:
26 |
27 | - **Flat Index**: This is the simplest form, where all vectors are stored, and the search is performed using brute force. For a query vector $$\mathbf{q}$$, the search involves calculating the distance to every vector in the index.
28 |
29 | - **Inverted File Index (IVF)**: This partitions the vector space into clusters. Each cluster is represented by a centroid, and vectors are assigned to these clusters. The search process involves:
30 | 1. **Cluster Assignment**: For a query vector $$\mathbf{q}$$, find the nearest centroids using a coarse quantizer (e.g., using L2 distance).
31 | 2. **Refined Search**: Only search within the nearest clusters.
32 |
33 | - **Product Quantization (PQ)**: This technique compresses the vector representation to save memory. It divides each vector into $$M$$ subvectors and quantizes each subvector separately. The distance computation for a query vector $$\mathbf{q}$$ involves:
34 | $$
35 | D(\mathbf{q}, \mathbf{c}) \approx \sum_{m=1}^{M} D(\mathbf{q}_m, \mathbf{c}_m)
36 | $$
37 | where $$\mathbf{c}_m$$ is the quantized representation of the $$m^{th}$$ subvector.
38 |
39 | - **Hierarchical Navigable Small World (HNSW)**: This is a graph-based approach that allows for fast nearest neighbor searches. It constructs a multi-layer graph where each layer contains a subset of the vectors, enabling efficient traversal to find nearest neighbors.
40 |
41 | ### Example Search Process
42 |
43 | 1. **Index Creation**:
44 | Suppose we have a dataset of vectors representing various sentences, including our example. We would first create an index:
45 | ```python
46 | import faiss
47 | d = 768 # Example dimension
48 | index = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, 100, 16, 8)  # args: quantizer, d, nlist=100, M=16, nbits=8 (positional; the bindings do not accept keyword arguments)
49 | index.train(training_vectors) # Train the index with a subset of vectors
50 | index.add(vectors) # Add all vectors to the index
51 | ```
52 |
53 | 2. **Query Vector**:
54 | For the query "I like to play football", we encode it into a vector $$\mathbf{q}$$:
55 | ```python
56 | xq = model.encode(["I like to play football"])  # "model" is assumed to be a sentence-embedding model (e.g., a SentenceTransformer) loaded earlier
57 | ```
58 |
59 | 3. **Search Execution**:
60 | To find the 4 nearest neighbors, we execute:
61 | ```python
62 | k = 4
63 | D, I = index.search(xq, k) # D contains distances, I contains indices of neighbors
64 | ```
65 |
66 | 4. **Distance Calculation**:
67 | The distances $$D$$ are computed using the selected metric (e.g., L2 distance or inner product) based on the indexed vectors.
68 |
69 | 5. **Results Interpretation**:
70 | The indices returned in $$I$$ correspond to the most similar sentences in the dataset. For instance, if the results are:
71 | ```
72 | I = [4586, 10252, 12465, 190]
73 | ```
74 | These indices can be used to retrieve the actual sentences that are most similar to the query vector.
75 |
76 | ### Performance Optimization
77 |
78 | FAISS is optimized for performance through various means:
79 |
80 | - **GPU Acceleration**: FAISS can leverage GPU resources to handle large-scale datasets, significantly speeding up both indexing and searching processes.
81 |
82 | - **Parameter Tuning**: Users can adjust parameters such as the number of clusters ($$nlist$$) and the number of probes ($$nprobe$$) to balance speed and accuracy. For example, increasing $$nprobe$$ allows more clusters to be searched, potentially improving accuracy at the cost of speed (a short sketch follows this list).
83 |
84 | - **Memory Management**: By using quantization techniques like PQ, FAISS reduces memory usage, allowing it to handle larger datasets efficiently.
85 |
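A short, hedged sketch of the `nprobe` tuning knob, continuing the index, `xq`, and `k` from the example above; the values swept here are arbitrary and the right setting depends on your recall/latency targets:

```python
# Widening the cluster search trades speed for recall (illustrative values)
for nprobe in (1, 4, 16, 64):
    index.nprobe = nprobe        # number of inverted lists scanned per query
    D, I = index.search(xq, k)   # recall typically rises with nprobe; so does latency
```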
86 | By combining these techniques, FAISS achieves state-of-the-art performance in similarity search, making it suitable for applications such as semantic search, recommendation systems, and more.
87 |
88 | Citations:
89 | [1] https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
90 | [2] https://unfoldai.com/effortless-large-scale-image-retrieval-with-faiss-a-hands-on-tutorial/
91 | [3] https://www.youtube.com/watch?v=sKyvsdEv6rk
92 | [4] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/
93 | [5] https://ai.meta.com/tools/faiss/
94 |
--------------------------------------------------------------------------------
/GenerativeAI/4. Transformers and Vector DB Interview prep.md:
--------------------------------------------------------------------------------
1 | Here are 30 key questions and answers to help you prepare for a Generative AI interview, with a focus on Transformer architectures and vector databases:
2 |
3 | ## Transformer Architectures
4 |
5 | 1. **What are the key components of a Transformer architecture?**
6 | - Encoder and decoder layers
7 | - Attention mechanisms
8 | - Feed-forward neural networks
9 | - Layer normalization and residual connections
10 |
11 | 2. **How does the attention mechanism work in Transformers?**
12 | - Computes a weighted sum of values based on the compatibility between keys and queries
13 | - Allows the model to focus on relevant parts of the input sequence
14 | - Enables capturing long-range dependencies without relying on recurrence or convolutions
15 |
16 | 3. **What are the advantages of using Transformer architectures compared to RNNs and CNNs?**
17 | - Parallelization of computations
18 | - Ability to capture long-range dependencies
19 | - Improved performance on tasks like machine translation and language understanding
20 |
21 | 4. **Can you explain the concept of self-attention in Transformers?**
22 | - Attention mechanism applied to the same sequence
23 | - Allows the model to attend to different positions within the same sequence
24 | - Helps capture contextual information within a sequence
25 |
26 | 5. **How do Transformer architectures handle variable-length input sequences?**
27 | - Use of padding tokens and masking techniques
28 | - Padding is added to ensure all sequences have the same length
29 |     - Masking is applied to ignore the contributions of padding tokens during attention computations (a minimal sketch follows below)
30 |
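As referenced in question 5, here is a minimal, hedged sketch of padding masking applied to raw attention scores; all names, shapes, and values are illustrative:

```python
import numpy as np

seq_len, pad_len = 5, 2                     # last two positions are padding
scores = np.random.rand(seq_len, seq_len)   # raw attention scores (illustrative)

# Mask is True for real tokens, False for padding
valid = np.array([True] * (seq_len - pad_len) + [False] * pad_len)

# Set scores toward padded keys to -inf so softmax gives them zero weight
masked = np.where(valid[np.newaxis, :], scores, -np.inf)
weights = np.exp(masked - masked.max(axis=1, keepdims=True))
weights /= weights.sum(axis=1, keepdims=True)
print(weights.round(3))  # columns for padded positions are all zeros
```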
31 | 6. **What are the differences between encoder-only, decoder-only, and encoder-decoder Transformer architectures?**
32 | - Encoder-only: Used for tasks like language understanding (e.g., BERT)
33 | - Decoder-only: Used for autoregressive tasks like language generation (e.g., GPT)
34 | - Encoder-decoder: Used for sequence-to-sequence tasks like machine translation (e.g., Transformer)
35 |
36 | 7. **Can you explain the concept of positional encoding in Transformer architectures?**
37 | - Injects positional information into the input embeddings
38 | - Enables the model to understand the relative or absolute positions of tokens in the sequence
39 | - Common techniques include sinusoidal positional encoding and learned positional embeddings
40 |
41 | 8. **How do Transformer architectures handle long-range dependencies compared to RNNs and CNNs?**
42 | - Attention mechanisms allow for direct connections between distant tokens
43 | - Reduces the path length between related tokens
44 | - Enables better modeling of long-range dependencies
45 |
46 | 9. **What are the challenges and limitations of Transformer architectures?**
47 | - Quadratic complexity of attention with respect to sequence length
48 | - Memory and computational requirements can be high for long sequences
49 | - Potential for overfitting due to lack of inductive biases present in RNNs and CNNs
50 |
51 | 10. **Can you discuss some recent advancements and variants of Transformer architectures?**
52 | - Sparse Transformer: Reduces computational complexity by using sparse attention patterns
53 | - Reformer: Uses locality-sensitive hashing to efficiently compute attention
54 |     - Longformer: Combines local windowed attention with task-specific global attention to handle long documents efficiently
55 |
56 | ## Vector Databases
57 |
58 | 11. **What are vector databases, and how do they differ from traditional databases?**
59 | - Store data in the form of high-dimensional vectors
60 | - Optimized for similarity search and nearest neighbor retrieval
61 | - Differ from traditional databases in terms of data structure and query types
62 |
63 | 12. **What are the main use cases of vector databases in Generative AI?**
64 | - Semantic search and retrieval of relevant information for generation tasks
65 | - Storage and indexing of embeddings generated by Generative AI models
66 | - Efficient retrieval of similar examples for few-shot learning and prompting
67 |
68 | 13. **Can you explain the concept of approximate nearest neighbor (ANN) search in vector databases?**
69 | - Aims to find the closest vectors to a given query vector
70 | - Employs techniques like locality-sensitive hashing (LSH) and graph-based methods
71 | - Provides a trade-off between search accuracy and computational efficiency
72 |
73 | 14. **How do vector databases handle high-dimensional data?**
74 | - Use specialized index structures like HNSW (Hierarchical Navigable Small World) graphs
75 |     - Leverage dimensionality reduction techniques like PCA or random projections (t-SNE is primarily a visualization tool)
76 | - Optimize for efficient storage and retrieval of high-dimensional vectors
77 |
78 | 15. **What are some popular vector database systems used in Generative AI?**
79 | - Pinecone: Offers a managed vector database service with support for ANN search
80 | - Milvus: An open-source vector database with a focus on scalability and performance
81 | - Weaviate: Combines vector search with a GraphQL API for easy integration
82 |
83 | 16. **Can you discuss the role of vector databases in few-shot learning and prompting for Generative AI?**
84 | - Store relevant examples or prompts as vectors
85 | - Retrieve similar examples based on the input prompt or context
86 | - Provide additional information or guidance to the Generative AI model
87 |
88 | 17. **How do vector databases enable efficient retrieval of relevant information for generation tasks?**
89 | - Store generated outputs or relevant information as vectors
90 | - Perform similarity search to find the most relevant vectors based on the input
91 | - Retrieve the corresponding information to guide or enhance the generation process
92 |
93 | 18. **What are some challenges and limitations of using vector databases in Generative AI?**
94 | - Handling dynamic updates and changes to the stored vectors
95 | - Ensuring data privacy and security when storing sensitive information
96 | - Balancing the trade-off between search accuracy and computational efficiency
97 |
98 | 19. **Can you discuss the integration of vector databases with Generative AI models?**
99 | - Seamless integration through APIs or query languages
100 | - Ability to perform vector search and retrieval within the Generative AI pipeline
101 | - Enables end-to-end solutions for tasks like question-answering and dialogue generation
102 |
103 | 20. **What are some future trends and advancements in vector databases for Generative AI?**
104 | - Improved scalability and performance for handling large-scale datasets
105 | - Incorporation of deep learning techniques for better similarity search
106 | - Integration with other AI technologies like knowledge graphs and reasoning engines
107 |
108 | ## Generative AI Fundamentals
109 |
110 | 21. **What are the key differences between discriminative and generative models in machine learning?**
111 | - Discriminative models learn the decision boundary between classes
112 | - Generative models learn the underlying data distribution to generate new samples
113 |
114 | 22. **Can you explain the concept of latent space in generative models?**
115 | - Represents a lower-dimensional space where the model encodes data features
116 | - Enables manipulation of these features to generate new, meaningful samples
117 |
118 | 23. **What are some common evaluation metrics used for assessing the quality of generated samples?**
119 | - Inception Score (IS): Measures the quality and diversity of generated samples
120 | - Fréchet Inception Distance (FID): Compares the statistics of generated samples with real samples
121 | - Human evaluation: Relies on subjective assessments by human judges
122 |
123 | 24. **How do you handle mode collapse in Generative Adversarial Networks (GANs)?**
124 | - Use techniques like mini-batch discrimination and spectral normalization
125 | - Incorporate different loss functions like WGAN-GP
126 | - Employ data augmentation strategies to increase the diversity of training samples
127 |
128 | 25. **Can you discuss the role of Generative AI in few-shot learning and prompt engineering?**
129 | - Generative models can generate relevant examples or prompts for few-shot learning
130 | - Prompts can guide the model to generate outputs that align with specific instructions or styles
131 | - Enables efficient learning from limited data and customization of generated outputs
132 |
133 | 26. **What are some ethical considerations when deploying Generative AI systems?**
134 | - Potential for generating biased or harmful content
135 | - Ensuring transparency and accountability in the decision-making process
136 | - Addressing issues related to data privacy and security
137 |
138 | 27. **How do you assess the quality and consistency of generated outputs from Generative AI models?**
139 | - Employ techniques like hallucination detection and factual consistency checking
140 | - Utilize tools like SelfCheckGPT and G-EVAL for evaluating the quality of generated text
141 | - Combine different evaluation methods to provide a comprehensive assessment
142 |
143 | 28. **Can you discuss the role of Generative AI in multimodal learning and generation?**
144 | - Integrate information from multiple modalities like text, images, and audio
145 | - Leverage cross-modal attention mechanisms to capture interactions between modalities
146 | - Enable generation of outputs in one modality conditioned on inputs from other modalities
147 |
148 | 29. **What are some common challenges and limitations in training Generative AI models?**
149 | - Unstable training dynamics and mode collapse in GANs
150 | - Difficulty in scaling up to high-resolution image generation or long-form text generation
151 | - Ensuring consistent and coherent generation across multiple steps or iterations
152 |
153 | 30. **Can you discuss the potential future advancements and applications of Generative AI?**
154 | - Continued improvements in generation quality and diversity
155 | - Expansion to new domains like video, 3D objects, and interactive environments
156 | - Integration with other AI technologies like reasoning engines and knowledge bases
157 | - Widespread adoption in various industries like entertainment, healthcare, and education
158 |
159 | Remember to tailor your answers based on your specific experience and knowledge, and feel free to explore additional resources to deepen your understanding of Transformer architectures and vector databases in the context of Generative AI.
160 |
161 | Citations:
162 | [1] https://www.usebraintrust.com/hire/interview-questions/generative-ai-specialists
163 | [2] https://www.reddit.com/r/MachineLearning/comments/17u7b19/d_genaillm_interview_prep/
164 | [3] https://github.com/aishwaryanr/awesome-generative-ai-guide/blob/main/interview_prep/60_gen_ai_questions.md
165 | [4] https://incubity.ambilio.com/top-25-generative-ai-interview-questions-with-answers/
166 | [5] https://www.youtube.com/watch?v=F1lsFTpsQLI
167 |
--------------------------------------------------------------------------------
/GenerativeAI/5. FAISS interview prep.md:
--------------------------------------------------------------------------------
1 | Here are 30 interview questions and answers focused on FAISS (Facebook AI Similarity Search) and its applications in Generative AI, covering various scenarios you might encounter:
2 |
3 | ## FAISS and Its Applications
4 |
5 | 1. **What is FAISS and what are its primary uses?**
6 | - FAISS is a library developed by Facebook AI Research designed for efficient similarity search and clustering of dense vectors. It is primarily used for tasks like nearest neighbor search in high-dimensional spaces, which is essential in applications such as image retrieval, recommendation systems, and natural language processing.
7 |
8 | 2. **How does FAISS handle high-dimensional data?**
9 | - FAISS employs various indexing structures, such as inverted file systems and product quantization, to efficiently manage high-dimensional data. These structures allow for fast approximate nearest neighbor searches while reducing memory usage.
10 |
11 | 3. **What are the different types of indexes available in FAISS?**
12 | - FAISS provides several index types, including:
13 | - Flat Index: Exact nearest neighbor search.
14 | - IVFFlat: Inverted file index with flat quantization for approximate search.
15 | - HNSW: Hierarchical Navigable Small World graph for efficient approximate searches.
16 |      - PQ (Product Quantization): Compresses vectors by quantizing sub-vectors separately, cutting memory use and speeding up searches.
17 |
18 | 4. **Can you explain the concept of approximate nearest neighbor (ANN) search in FAISS?**
19 | - ANN search in FAISS aims to find the closest vectors to a query vector quickly without exhaustively comparing all vectors. It uses techniques like clustering and quantization to limit the search space, trading off some accuracy for speed.
20 |
21 | 5. **What are the advantages of using FAISS over other vector search libraries?**
22 | - FAISS is optimized for performance, scalability, and flexibility. It supports large datasets, provides various indexing methods, and is designed to work efficiently on both CPUs and GPUs, making it suitable for high-performance applications.
23 |
24 | 6. **How do you optimize FAISS for large-scale datasets?**
25 | - To optimize FAISS for large datasets, you can:
26 | - Use appropriate index types like IVFPQ or HNSW for faster searches.
27 | - Leverage GPU acceleration for computation-heavy tasks.
28 | - Fine-tune parameters like the number of clusters and quantization levels based on your data characteristics.
29 |
30 | 7. **What is the role of vector embeddings in FAISS?**
31 | - Vector embeddings represent data points in a high-dimensional space, capturing their semantic meanings. In FAISS, these embeddings are used to perform similarity searches, allowing the retrieval of similar items based on their vector representations.
32 |
33 | 8. **Can you describe a scenario where you used FAISS in a project?**
34 | - In a project for an e-commerce platform, I implemented FAISS to enhance the product recommendation system. By indexing product embeddings generated from user interactions, we achieved real-time recommendations based on user preferences, significantly improving user engagement.
35 |
36 | 9. **What challenges did you face while implementing FAISS, and how did you overcome them?**
37 | - One challenge was managing memory usage with large datasets. I addressed this by using product quantization to reduce the memory footprint of the embeddings while maintaining reasonable search accuracy.
38 |
39 | 10. **How does FAISS compare to traditional databases for similarity search?**
40 | - Unlike traditional databases that focus on exact matches and structured queries, FAISS is optimized for high-dimensional vector similarity searches, allowing for approximate matches that are crucial in AI applications like image and text retrieval.
41 |
42 | 11. **What are the typical preprocessing steps before using FAISS?**
43 | - Typical preprocessing steps include:
44 | - Normalizing the vectors to ensure consistent distances.
45 | - Reducing dimensionality if necessary, using techniques like PCA.
46 | - Ensuring that the data is in the correct format for FAISS indexing.
47 |
48 | 12. **How do you evaluate the performance of a FAISS index?**
49 | - Performance can be evaluated using metrics such as:
50 | - Recall: The fraction of relevant items retrieved.
51 | - Precision: The fraction of retrieved items that are relevant.
52 | - Latency: The time taken to perform searches.
53 |
54 | 13. **What is the significance of the `nlist` parameter in FAISS?**
55 | - The `nlist` parameter defines the number of clusters in an inverted file index. A higher `nlist` can improve recall but may increase search time and memory usage. Tuning this parameter is crucial for balancing performance and resource usage.
56 |
57 | 14. **How can FAISS be integrated with machine learning models?**
58 | - FAISS can be integrated with machine learning models by using it to index embeddings generated by those models. For example, after training a neural network to generate embeddings for images, FAISS can be used to perform similarity searches among those embeddings.
59 |
60 | 15. **What is the role of quantization in FAISS?**
61 | - Quantization reduces the precision of vector representations to decrease memory usage and speed up searches. FAISS supports various quantization techniques, such as scalar quantization and product quantization, to optimize performance.
62 |
63 | 16. **Can you explain the concept of "inverted file" indexing in FAISS?**
64 | - Inverted file indexing groups vectors into clusters and maintains a list of vectors for each cluster. This allows FAISS to quickly narrow down the search to a subset of vectors, significantly speeding up the nearest neighbor search process.
65 |
66 | 17. **How do you handle updates to the dataset in FAISS?**
67 | - FAISS allows for dynamic updates by adding or removing vectors from the index. However, for large-scale updates, it may be more efficient to rebuild the index periodically rather than updating it incrementally.
68 |
69 | 18. **What are some common pitfalls when using FAISS?**
70 | - Common pitfalls include:
71 | - Not normalizing vectors, which can lead to inaccurate distance calculations.
72 | - Using inappropriate index types for the data size and search requirements.
73 | - Failing to tune parameters like `nlist` and `nprobe` for optimal performance.
74 |
75 | 19. **How does FAISS support GPU acceleration?**
76 | - FAISS provides a GPU module that allows for the indexing and searching of vectors on NVIDIA GPUs. This significantly speeds up operations, especially for large datasets and complex queries.
77 |
78 | 20. **What is the `nprobe` parameter in FAISS, and how does it affect search results?**
79 |     - The `nprobe` parameter determines the number of clusters to search during a query. A higher `nprobe` increases the chances of finding relevant results but also increases search time. Tuning this parameter is essential for balancing speed and accuracy (see the sketch below).
80 |
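A self-contained, hedged sketch of the `nlist`/`nprobe` interplay discussed in questions 13 and 20 (sizes and values are illustrative):

```python
import numpy as np
import faiss

d, nb = 64, 10_000
xb = np.random.rand(nb, d).astype("float32")

nlist = 100                                  # number of IVF clusters
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)
index.train(xb)
index.add(xb)

for nprobe in (1, 10, 50):                   # more clusters searched: better recall, slower queries
    index.nprobe = nprobe
    D, I = index.search(xb[:5], 4)           # query with the first 5 vectors, top-4 each
print(I)
```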
81 | 21. **How can you use FAISS for clustering tasks?**
82 | - FAISS can be used for clustering by applying algorithms like k-means on the vector embeddings. Once clusters are formed, FAISS can efficiently retrieve points belonging to specific clusters or find nearest neighbors within those clusters.
83 |
84 | 22. **What are the trade-offs between using exact and approximate search in FAISS?**
85 | - Exact search guarantees the most accurate results but is computationally expensive and slow for large datasets. Approximate search is faster and uses less memory but may sacrifice some accuracy, making it suitable for real-time applications.
86 |
87 | 23. **Can FAISS be used for text similarity search? If so, how?**
88 | - Yes, FAISS can be used for text similarity search by converting text into embeddings using models like BERT or Sentence Transformers. These embeddings can then be indexed in FAISS for efficient similarity searches.
89 |
90 | 24. **How would you implement a recommendation system using FAISS?**
91 | - To implement a recommendation system:
92 | - Generate embeddings for items and users.
93 | - Index these embeddings using FAISS.
94 | - For a given user, retrieve similar items based on their embedding using FAISS's nearest neighbor search.
95 |
96 | 25. **What is the role of the `metric` parameter in FAISS?**
97 | - The `metric` parameter defines the distance metric used for similarity calculations, such as L2 (Euclidean) or inner product. The choice of metric can significantly affect the search results and should align with the data characteristics.
98 |
99 | 26. **How do you ensure the scalability of FAISS in production environments?**
100 | - Scalability can be ensured by:
101 | - Using distributed computing frameworks to handle large datasets.
102 | - Optimizing index parameters based on the expected load and query patterns.
103 | - Regularly monitoring performance and adjusting configurations as needed.
104 |
105 | 27. **What are some best practices for using FAISS effectively?**
106 | - Best practices include:
107 | - Regularly profiling and benchmarking index performance.
108 | - Experimenting with different index types and parameters.
109 | - Keeping the vector space normalized and well-structured for optimal searches.
110 |
111 | 28. **How do you manage the trade-off between accuracy and speed in FAISS?**
112 | - Manage this trade-off by tuning parameters like `nlist`, `nprobe`, and choosing the right index type based on the specific application requirements. Regular testing and validation against real-world queries can help find the right balance.
113 |
114 | 29. **Can you discuss a specific feature of FAISS that you find particularly useful?**
115 | - One particularly useful feature is the ability to perform multi-threaded searches, which significantly speeds up retrieval times, especially when handling large datasets in real-time applications.
116 |
117 | 30. **What future developments do you foresee for FAISS and vector search technologies?**
118 | - Future developments may include enhanced support for hybrid search combining structured and unstructured data, improved algorithms for dynamic indexing, and better integration with deep learning frameworks for real-time applications.
119 |
120 | These questions and answers should help you prepare effectively for your interview related to FAISS and its applications in Generative AI.
121 |
122 | Citations:
123 | [1] https://incubity.ambilio.com/top-25-generative-ai-interview-questions-with-answers/
124 | [2] https://www.youtube.com/watch?v=F1lsFTpsQLI
125 | [3] https://www.usebraintrust.com/hire/interview-questions/generative-ai-specialists
126 | [4] https://www.reddit.com/r/MachineLearning/comments/17u7b19/d_genaillm_interview_prep/
127 | [5] https://blog.streamlit.io/ai-interviewer-customized-interview-preparation-with-generative-ai/
128 |
--------------------------------------------------------------------------------
/GenerativeAI/6.RecursiveReferenceRAG.md:
--------------------------------------------------------------------------------
1 | Here's an example implementation using Python and LangChain to handle document references in a Retrieval-Augmented Generation (RAG) architecture:
2 |
3 | ```python
4 | from langchain.document_loaders import TextLoader
5 | from langchain.embeddings import HuggingFaceEmbeddings
6 | from langchain.vectorstores import Chroma
7 | from langchain.chains import RetrievalQA
8 | from langchain.llms import HuggingFaceHub
9 |
10 | class DocumentReferenceRAG:
11 | def __init__(self, documents):
12 | self.documents = documents
13 | self.embeddings = HuggingFaceEmbeddings()
14 | self.vectorstore = Chroma.from_documents(self.documents, self.embeddings)
15 | self.llm = HuggingFaceHub(repo_id="google/flan-t5-xl")
16 |         self.qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff", retriever=self.vectorstore.as_retriever(), return_source_documents=True)
17 |
18 | def answer_question(self, question, max_recursion_depth=3):
19 | return self._recursive_answer(question, max_recursion_depth)
20 |
21 | def _recursive_answer(self, question, max_recursion_depth, processed_docs=None):
22 | if processed_docs is None:
23 | processed_docs = set()
24 |
25 |         result = self.qa({"query": question})  # legacy chain call; returns {"result": ..., "source_documents": [...]}
26 |         processed_docs.add(result["source_documents"][0].metadata['source'])
27 |
28 |         for doc in result["source_documents"]:
29 | if 'referenced_docs' in doc.metadata:
30 | for ref_doc_link in doc.metadata['referenced_docs']:
31 | if ref_doc_link not in processed_docs and max_recursion_depth > 0:
32 | ref_doc = self._retrieve_document(ref_doc_link)
33 | if ref_doc:
34 | self.documents.append(ref_doc)
35 | self.vectorstore = Chroma.from_documents(self.documents, self.embeddings)
36 |                         self.qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff", retriever=self.vectorstore.as_retriever(), return_source_documents=True)
37 | result = self._recursive_answer(question, max_recursion_depth - 1, processed_docs)
38 | break
39 |
40 | return result
41 |
42 | def _retrieve_document(self, doc_link):
43 | # Implement document retrieval logic based on the provided link
44 | # For example, load the document from a file or database
45 | loader = TextLoader(doc_link)
46 | return loader.load()[0]
47 |
48 | # Example usage
49 | doc1 = TextLoader('doc1.txt').load()[0]
50 | doc2 = TextLoader('doc2.txt').load()[0]
51 | doc3 = TextLoader('doc3.txt').load()[0]
52 | doc4 = TextLoader('doc4.txt').load()[0]
53 | doc5 = TextLoader('doc5.txt').load()[0]
54 |
55 | rag = DocumentReferenceRAG([doc1, doc2, doc3, doc4, doc5])
56 | question = "What is the relationship between document 1 and document 3?"
57 | answer = rag.answer_question(question)
58 | print(answer["result"])  # the answer text; answer["source_documents"] lists the supporting documents
59 | ```
60 |
61 | In this example:
62 |
63 | 1. The `DocumentReferenceRAG` class is defined to handle the recursive retrieval and processing of documents.
64 |
65 | 2. The `__init__` method initializes the necessary components:
66 | - Loads the initial set of documents
67 | - Creates document embeddings using HuggingFaceEmbeddings
68 | - Stores the documents in a Chroma vector store
69 | - Sets up the LLM (HuggingFaceHub) and RetrievalQA chain
70 |
71 | 3. The `answer_question` method takes a question and an optional maximum recursion depth. It calls the `_recursive_answer` method to generate the answer.
72 |
73 | 4. The `_recursive_answer` method implements the recursive retrieval process:
74 | - Generates an initial answer using the RetrievalQA chain
75 |    - Checks whether the retrieved source documents reference other documents through their metadata
76 | - If referenced documents are found, recursively retrieves them using the `_retrieve_document` method
77 | - Appends the retrieved documents to the document collection and updates the vector store and RetrievalQA chain
78 | - Repeats the process until no new referenced documents are found or the maximum recursion depth is reached
79 |
80 | 5. The `_retrieve_document` method is a placeholder for the actual document retrieval logic. In this example, it loads the document from a file using the `TextLoader` from Langchain.
81 |
82 | 6. In the example usage, five documents are loaded, and the `DocumentReferenceRAG` class is instantiated with these documents.
83 |
84 | 7. A question is asked, and the `answer_question` method is called to generate the final answer, considering the referenced documents.
85 |
86 | This implementation demonstrates how to extend a RAG architecture to handle document references using LangChain. The recursive retrieval process ensures that all relevant documents are considered when answering questions, even if they are referenced only within other documents.
87 | 
88 | Remember to customize the `_retrieve_document` method to match your specific document storage and retrieval mechanism. Rebuilding the Chroma store from scratch on every recursion is expensive; LangChain vector stores also expose an `add_documents` method, so newly retrieved references can be appended incrementally. You may also want to add error handling and further optimizations based on your requirements.
89 |
--------------------------------------------------------------------------------
/GenerativeAI/References.md:
--------------------------------------------------------------------------------
1 | FAISS Similarity Search
2 | https://www.youtube.com/playlist?list=PLIUOU7oqGTLhlWpTz4NnuT3FekouIVlqc
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Amogh Singhal
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/adjacentElementProduct.py:
--------------------------------------------------------------------------------
1 | # Given an array of integers, find the pair of adjacent elements
2 | # that has the largest product and return that product.
3 |
4 | # Approach 1: (Brute Force) - Check every pair in the list and return the maximum product (note: this scans ALL pairs, not just adjacent ones)
5 | # Time Complexity: O(N^2)
6 |
7 | def adjacentElementProductBF(inputArray):
8 |     largestProduct = float("-inf")  # safe lower bound for any input values
9 |
10 | # for sanity check, assert if array contains at least 2 elements
11 | if len(inputArray) < 2:
12 | print("No pairs exists")
13 | return -1
14 |
15 | for i in range(0, len(inputArray)):
16 | for j in range(i+1, len(inputArray)):
17 | currentProduct = inputArray[i]*inputArray[j]
18 |
19 | if currentProduct > largestProduct:
20 | largestProduct = currentProduct
21 |
22 | return largestProduct
23 |
24 | # Approach 2: (Sort & Pick Last Pair) - Sort the list and then pick the last two numbers
25 | # Caveat: All elements must be positive
26 | # Time Complexity: O(Nlog(N))
27 |
28 | def adjacentElementsProductSort(inputArray):
29 | size = len(inputArray)
30 |
31 | if size < 2:
32 | print("No Pairs exist")
33 | return -1
34 |
35 | sortedArray = sorted(inputArray)
36 | return sortedArray[-1] * sortedArray[-2]
37 |
38 |
39 | def adjacentElementsProduct(inputArray):
40 |
41 | length = int(len(inputArray))
42 |
43 | maxm = inputArray[0]*inputArray[1]
44 | product = 1
45 | for i in range(1, length-1):
46 | product = inputArray[i]*inputArray[i+1]
47 |
48 | if product>maxm:
49 | maxm = product
50 |
51 | return maxm
52 |
53 |
54 | # print(adjacentElementsProduct([3,6,7,5]))
55 |
56 | print(adjacentElementsProduct([3, 6, -2, -5, 7, 3]))
57 |
58 | #Alternate solution
59 | #return max([inputArray[i]*inputArray[i+1] for i in range(0, int(len(inputArray)-1))])
60 |
--------------------------------------------------------------------------------
/atoi.py:
--------------------------------------------------------------------------------
1 | # Convert the string "123" into 123, without using the built-in `int()`
2 |
3 | # Strategy
4 | # 1. loop through each character
5 | # 2. compare it against each digit in range(10)
6 | # 3. once the digit is found, add it to the accumulator
7 | # 4. multiply the accumulator by 10 on each iteration (starting from 0)
9 | def atoi(inputStr):
10 | outputNum = 0
11 | for char in inputStr:
12 | for i in range(10):
13 | if str(i) == char:
14 | outputNum = outputNum * 10 + i
15 | return outputNum
16 |
17 | x = "123"
18 | y = atoi(x)
19 | print(y)
20 |
--------------------------------------------------------------------------------
/binary_search_recursive.py:
--------------------------------------------------------------------------------
1 | # Given an array, find if the number exists
2 | # This is a `recursive` implementation of the
3 | # binary search. If the element is not found
4 | # it returns -1
5 |
6 |
7 | def binarySearch(lst, key, l, r):
8 | if r >= l:
9 | mid = l + (r - l) // 2
10 | # use print(l, mid, r) to view the process
11 | if lst[mid] == key:
12 | return mid
13 | elif lst[mid] < key:
14 | return binarySearch(lst, key, mid + 1, r)
15 | elif lst[mid] > key:
16 | return binarySearch(lst, key, l, mid - 1)
17 | else:
18 | return -1
19 |
20 |
21 | arr = [int(i) for i in range(101)]
22 | print(binarySearch(arr, 67, 0, len(arr) - 1))
23 |
--------------------------------------------------------------------------------
/bits_wilp/Ex2_Numpy_Q1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "846061b9-4829-48a0-a2be-dae592b8f95a",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "0901b4de-dd28-4433-ac7b-8db0d9a0f995",
17 | "metadata": {},
18 | "source": [
19 | "### Size of Numpy array in bytes"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "id": "2cec0596-d393-479b-bf67-3deddec0ea9e",
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdin",
30 | "output_type": "stream",
31 | "text": [
32 |         "Enter integers separated by space. Press ENTER to end... 34 23 67 89\n"
33 | ]
34 | },
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Given sequence is\n",
40 | "[34, 23, 67, 89]\n",
41 | "Number of elements in the numpy array: 4\n",
42 | "Total bytes consumed by the numpy array: 16\n",
43 | "Size in bytes of each element in the numpy array: 4\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "try:\n",
49 | " # feed integer array from user\n",
50 |     " arr = list(map(int, input(\"Enter integers separated by space. Press ENTER to end...\").split()))\n",
51 | " print(\"Given sequence is\")\n",
52 | " print(arr)\n",
53 | " # convert Python array to Numpy array\n",
54 | " np_array = np.array(arr, dtype=int)\n",
55 | " print(f\"Number of elements in the numpy array: {np_array.size}\")\n",
56 | " print(f\"Total bytes consumed by the numpy array: {np_array.nbytes}\")\n",
57 | " print(f\"Size in bytes of each element in the numpy array: {(np_array.nbytes)//(np_array.size)}\")\n",
58 | "except ValueError as e:\n",
59 | " print(\"ERROR: Please enter only integers !!!\")\n",
60 | " print(e)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "id": "3e18b28b-796a-428f-96f7-cd530fd0cfd1",
67 | "metadata": {},
68 | "outputs": [],
69 | "source": []
70 | }
71 | ],
72 | "metadata": {
73 | "kernelspec": {
74 | "display_name": "Python 3",
75 | "language": "python",
76 | "name": "python3"
77 | },
78 | "language_info": {
79 | "codemirror_mode": {
80 | "name": "ipython",
81 | "version": 3
82 | },
83 | "file_extension": ".py",
84 | "mimetype": "text/x-python",
85 | "name": "python",
86 | "nbconvert_exporter": "python",
87 | "pygments_lexer": "ipython3",
88 | "version": "3.8.5"
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 5
93 | }
94 |
--------------------------------------------------------------------------------
/bits_wilp/Ex2_Numpy_Q2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "e572d5e4-94a8-4059-bc1f-332e713b02e3",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "b525ea5a-5f30-40ac-a98c-40394d788bba",
17 | "metadata": {},
18 | "source": [
19 |     "### Set Difference Between Two Numpy Arrays"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 7,
25 | "id": "bdb7936b-b56f-458e-9e72-70eeb7968190",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# Function to take input array from user\n",
30 | "def np_array_frm_input(prompt):\n",
31 | " try:\n",
32 | " arr = list(map(int, input(prompt).split()))\n",
33 | " np_array = np.array(arr, dtype=int)\n",
34 | " except ValueError as e:\n",
35 | " np_array = None\n",
36 | " print(\"ERROR: Please enter only integers !!!\")\n",
37 | " print(e)\n",
38 | " return np_array"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 17,
44 | "id": "1654ed7a-63f9-44bf-a30e-ea85ecd806fd",
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdin",
49 | "output_type": "stream",
50 | "text": [
51 | "\n",
52 | "Enter numbers for first sequence: 6 4 2\n",
53 | "\n",
54 | "Enter numbers for second sequence: 3 4\n"
55 | ]
56 | },
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "\n",
62 | "The set difference between [6 4 2] and [3 4] is [2 6]\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "# Return the sorted, unique values in array1 that are not in array2\n",
68 | "np_arr1 = np_array_frm_input(\"\\nEnter numbers for first sequence: \")\n",
69 | "np_arr2 = np_array_frm_input(\"\\nEnter numbers for second sequence: \")\n",
70 | "print(f\"\\nThe set difference between {np_arr1} and {np_arr2} is {np.setdiff1d(np_arr1, np_arr2)}\")"
71 | ]
72 | }
73 | ],
74 | "metadata": {
75 | "kernelspec": {
76 | "display_name": "Python 3",
77 | "language": "python",
78 | "name": "python3"
79 | },
80 | "language_info": {
81 | "codemirror_mode": {
82 | "name": "ipython",
83 | "version": 3
84 | },
85 | "file_extension": ".py",
86 | "mimetype": "text/x-python",
87 | "name": "python",
88 | "nbconvert_exporter": "python",
89 | "pygments_lexer": "ipython3",
90 | "version": "3.8.5"
91 | }
92 | },
93 | "nbformat": 4,
94 | "nbformat_minor": 5
95 | }
96 |
--------------------------------------------------------------------------------
/bits_wilp/Ex2_Numpy_Q3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "3b75f73e-6d1b-4a9e-935d-4a3d229bcc6a",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "65b9b866-0960-40d1-8deb-4ebe1261d420",
17 | "metadata": {},
18 | "source": [
19 | "### Cross Product of two given vectors"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "id": "1a98a54e-7a31-4575-84ec-c72c4aadb8a4",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "# Function to take input array from user\n",
30 | "def np_array_frm_input(prompt):\n",
31 | " try:\n",
32 | " arr = list(map(int, input(prompt).split()))\n",
33 | " np_array = np.array(arr, dtype=int)\n",
34 | " except ValueError as e:\n",
35 | " np_array = None\n",
36 | " print(\"ERROR: Please enter only integers !!!\")\n",
37 | " print(e)\n",
38 | " return np_array"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 4,
44 | "id": "f540a81d-f8e5-44c1-a280-ac96211ea7c4",
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdin",
49 | "output_type": "stream",
50 | "text": [
51 | "Enter the first integer vector... 3\n",
52 | "Enter the second integer vector... 4 5\n"
53 | ]
54 | },
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "ERROR: Vectors must be 2D or 3D for computing cross-product !!!\n",
60 | "incompatible dimensions for cross product\n",
61 | "(dimension must be 2 or 3)\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "try:\n",
67 | " vec1 = np_array_frm_input(\"Enter the first integer vector...\")\n",
68 | " vec2 = np_array_frm_input(\"Enter the second integer vector...\")\n",
69 | " print(f\"The cross product between {vec1} and {vec2} is {np.cross(vec1, vec2)}\")\n",
70 | "except ValueError as e:\n",
71 | "    print(\"ERROR: Vectors must be 2D or 3D for computing cross-product !!!\")\n",
72 | " print(e)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "id": "a7c915f7-e509-46e9-843a-c891fd2e64a3",
79 | "metadata": {},
80 | "outputs": [],
81 | "source": []
82 | }
83 | ],
84 | "metadata": {
85 | "kernelspec": {
86 | "display_name": "Python 3",
87 | "language": "python",
88 | "name": "python3"
89 | },
90 | "language_info": {
91 | "codemirror_mode": {
92 | "name": "ipython",
93 | "version": 3
94 | },
95 | "file_extension": ".py",
96 | "mimetype": "text/x-python",
97 | "name": "python",
98 | "nbconvert_exporter": "python",
99 | "pygments_lexer": "ipython3",
100 | "version": "3.8.5"
101 | }
102 | },
103 | "nbformat": 4,
104 | "nbformat_minor": 5
105 | }
106 |
--------------------------------------------------------------------------------
/bits_wilp/Ex2_Numpy_Q4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "8ddc2bc7-042f-4552-92a3-f9047a132e56",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "30c3a8f2-797e-4427-9acd-0ea951421e92",
17 | "metadata": {},
18 | "source": [
19 | "### Determinant of a square array"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 22,
25 | "id": "16a039d2-473c-4b23-8d08-108b0edae19b",
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdin",
30 | "output_type": "stream",
31 | "text": [
32 | "Enter size of square array: 2\n",
33 | "Enter a square array in row-wise manner: 3 4 2 2\n"
34 | ]
35 | },
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "\n",
41 | "SUCCESS: Reshape operation completed\n",
42 | "Given square array is\n",
43 | "[[3 4]\n",
44 | " [2 2]]\n",
45 | "\n",
46 | "The rounded determinant of the above square array is -2.0\n",
47 | "The unrounded (floating point) determinant of the above square array is -1.9999999999999998\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "arr_shape = int(input(\"Enter size of square array: \"))\n",
53 | "arr = list(map(int,input(\"Enter a square array in row-wise manner: \").split()))\n",
54 | "\n",
55 | "if len(arr) == arr_shape**2 :\n",
56 | " sq_arr = np.array(arr, dtype=int).reshape(arr_shape, arr_shape)\n",
57 | " print(\"\\nSUCCESS: Reshape operation completed\")\n",
58 | " print(f\"Given square array is\")\n",
59 | " print(sq_arr)\n",
60 | "    print(f\"\\nThe rounded determinant of the above square array is {round(np.linalg.det(sq_arr),0)}\")\n",
61 | "    print(f\"The unrounded (floating point) determinant of the above square array is {np.linalg.det(sq_arr)}\")\n",
62 | "else:\n",
63 | " print(f\"ERROR: Cannot reshape array of size {len(arr)} into {(arr_shape, arr_shape)}\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "id": "35643ed3-5bfe-443a-adf0-df266ba9da72",
70 | "metadata": {},
71 | "outputs": [],
72 | "source": []
73 | }
74 | ],
75 | "metadata": {
76 | "kernelspec": {
77 | "display_name": "Python 3",
78 | "language": "python",
79 | "name": "python3"
80 | },
81 | "language_info": {
82 | "codemirror_mode": {
83 | "name": "ipython",
84 | "version": 3
85 | },
86 | "file_extension": ".py",
87 | "mimetype": "text/x-python",
88 | "name": "python",
89 | "nbconvert_exporter": "python",
90 | "pygments_lexer": "ipython3",
91 | "version": "3.8.5"
92 | }
93 | },
94 | "nbformat": 4,
95 | "nbformat_minor": 5
96 | }
97 |
--------------------------------------------------------------------------------
/bits_wilp/Ex2_Numpy_Q5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "678ce984-841d-49bd-b2c5-8586aaed05b2",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "c8866271-04d5-4bd6-9916-f55bc5889968",
17 | "metadata": {},
18 | "source": [
19 | "### Eigenvalues and Eigenvectors of a square array"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 3,
25 | "id": "346bcf67-e5be-49d2-8e22-31d87ed416cd",
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdin",
30 | "output_type": "stream",
31 | "text": [
32 | "Enter size of square array: 2\n",
33 | "Enter a square array in row-wise manner: 1 2 2 1\n"
34 | ]
35 | },
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "\n",
41 | "SUCCESS: Reshape operation completed\n",
42 | "Given square array is\n",
43 | "[[1 2]\n",
44 | " [2 1]]\n",
45 | "The eigenvalues of the above square array are\n",
46 | "[ 3. -1.]\n",
47 | "The eigenvectors of the above square array are\n",
48 | "[[ 0.70710678 -0.70710678]\n",
49 | " [ 0.70710678 0.70710678]]\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "arr_shape = int(input(\"Enter size of square array: \"))\n",
55 | "arr = list(map(int,input(\"Enter a square array in row-wise manner: \").split()))\n",
56 | "\n",
57 | "if len(arr) == arr_shape**2 :\n",
58 | " sq_arr = np.array(arr, dtype=int).reshape(arr_shape, arr_shape)\n",
59 | " print(\"\\nSUCCESS: Reshape operation completed\")\n",
60 | " print(f\"Given square array is\")\n",
61 | " print(sq_arr)\n",
62 | " \n",
63 | " eig_val, eig_vec = np.linalg.eig(sq_arr)\n",
64 | "    print(\"The eigenvalues of the above square array are\")\n",
65 | "    print(eig_val)\n",
66 | "    print(\"The eigenvectors of the above square array are\")\n",
67 | " print(eig_vec)\n",
68 | "else:\n",
69 | " print(f\"ERROR: Cannot reshape array of size {len(arr)} into {(arr_shape, arr_shape)}\")"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "id": "897a5000-e52d-4a07-afc1-2b722ec423e5",
76 | "metadata": {},
77 | "outputs": [],
78 | "source": []
79 | }
80 | ],
81 | "metadata": {
82 | "kernelspec": {
83 | "display_name": "Python 3",
84 | "language": "python",
85 | "name": "python3"
86 | },
87 | "language_info": {
88 | "codemirror_mode": {
89 | "name": "ipython",
90 | "version": 3
91 | },
92 | "file_extension": ".py",
93 | "mimetype": "text/x-python",
94 | "name": "python",
95 | "nbconvert_exporter": "python",
96 | "pygments_lexer": "ipython3",
97 | "version": "3.8.5"
98 | }
99 | },
100 | "nbformat": 4,
101 | "nbformat_minor": 5
102 | }
103 |
--------------------------------------------------------------------------------
/bits_wilp/Ex2_Numpy_Q6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "6fbe566f",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "a0d6b1f1",
17 | "metadata": {},
18 | "source": [
19 | "### Matrix Multiplication"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 14,
25 | "id": "77a499e1",
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "First matrix:\n",
33 | " [[4 3 3]\n",
34 | " [2 4 2]\n",
35 | " [0 1 4]]\n",
36 | "\n",
37 | "Second matrix:\n",
38 | " [[4 8 1]\n",
39 | " [0 2 3]\n",
40 | " [6 6 2]]\n",
41 | "\n",
42 | "Product of the two matrices\n",
43 | "[[34 56 19]\n",
44 | " [20 36 18]\n",
45 | " [24 26 11]]\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "def matgen3d():\n",
51 | " \"\"\"\n",
52 | " Function to generate a random integer 3x3 matrix\n",
53 | " Min value: 1, Max value: 9\n",
54 | "    Min value: 0, Max value: 9\n",
55 | " return np.random.randint(low=0, high=10, size=(3,3))\n",
56 | "\n",
57 | "mat1 = matgen3d()\n",
58 | "mat2 = matgen3d()\n",
59 | "\n",
60 | "print(\"First matrix:\\n\", mat1)\n",
61 | "print()\n",
62 | "print(\"Second matrix:\\n\", mat2)\n",
63 | "print()\n",
64 | "print(\"Product of the two matrices\")\n",
65 | "print(np.matmul(mat1, mat2))"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "2b277c10",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": []
75 | }
76 | ],
77 | "metadata": {
78 | "kernelspec": {
79 | "display_name": "Python 3",
80 | "language": "python",
81 | "name": "python3"
82 | },
83 | "language_info": {
84 | "codemirror_mode": {
85 | "name": "ipython",
86 | "version": 3
87 | },
88 | "file_extension": ".py",
89 | "mimetype": "text/x-python",
90 | "name": "python",
91 | "nbconvert_exporter": "python",
92 | "pygments_lexer": "ipython3",
93 | "version": "3.8.5"
94 | }
95 | },
96 | "nbformat": 4,
97 | "nbformat_minor": 5
98 | }
99 |
--------------------------------------------------------------------------------
/bits_wilp/Quiz 1_ S2-20_DSECLPFDS.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devAmoghS/Python-Interview-Problems-for-Practice/5e2189c50841bb9ed54807986b8dd010590fd9b5/bits_wilp/Quiz 1_ S2-20_DSECLPFDS.pdf
--------------------------------------------------------------------------------
/bits_wilp/binomialCoefficient.py:
--------------------------------------------------------------------------------
1 | def factorial(n):
2 | if n < 2:
3 | return 1
4 | else:
5 | return n * factorial(n-1)
6 |
7 |
8 | n = int(input("Enter the value of n: "))
9 | r = int(input("Enter the value of r: "))
10 |
11 | # int division to avoid `float` output
12 | ncr = factorial(n) // (factorial(r) * factorial(n-r))
13 |
14 | # string formatting
15 | result = "The binomial coefficient for {n} and {r} is {ncr}".format(n=n, r=r, ncr=ncr)
16 | print(result)
17 |
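18 | # Sanity check (a hedged addition, assumes Python 3.8+): math.comb computes
19 | # nCr directly and should agree with the factorial-based value above
20 | import math
21 | assert ncr == math.comb(n, r)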
--------------------------------------------------------------------------------
/bits_wilp/calculateFrequency.py:
--------------------------------------------------------------------------------
1 | def calculateFrequency(text):
2 |     nums = text.split()
3 | freq = {}
4 |
5 | for i in nums:
6 | if i not in freq.keys():
7 | freq[i] = 1
8 | else:
9 | freq[i] += 1
10 |
11 | print(freq)
12 |
13 |
14 | print("Please enter the numbers separated by space. \n Press ENTER when done: ")
15 | x = input()
16 | calculateFrequency(x)
17 |
--------------------------------------------------------------------------------
/bits_wilp/isAngstrom.py:
--------------------------------------------------------------------------------
1 | def isAngstrom(n):  # checks for an Armstrong (narcissistic) number
2 |     result = False
3 |     total = 0
4 |     order = len(n)
5 | 
6 |     for i in n:
7 |         total = total + int(i)**order
8 | 
9 |     if total == int(n):
10 |         result = True
11 |     return result
12 |
13 | print("Please enter a number: ")
14 | num = input()
15 | flag = isAngstrom(num)
16 |
17 | if flag:
18 | print(num, "is an Angstrom number")
19 | else:
20 | print(num, "is NOT an Angstrom number")
21 |
--------------------------------------------------------------------------------
/bits_wilp/isPalindrome.py:
--------------------------------------------------------------------------------
1 | def isPalindrome(text):
2 |     result = False
3 | 
4 |     if text == text[::-1]:
5 |         result = True
6 | 
7 |     return result
8 |
9 | print("Please enter a string: ")
10 | x = input()
11 | flag = isPalindrome(x)
12 |
13 | if flag:
14 | print(x, "is a Palindrome")
15 | else:
16 | print(x, "is NOT a Palindrome")
17 |
--------------------------------------------------------------------------------
/bits_wilp/practice.py:
--------------------------------------------------------------------------------
1 | def most_common(str_a, str_b):
2 | return set(str_a) & set(str_b)
3 |
4 | result = most_common("NAINA", "RENNE")
5 | print(result)  # {'N'}
6 |
7 | def get_freq(text):
8 |     freq_dict = {}
9 |     for char in text.split():
10 | if char not in freq_dict.keys():
11 | freq_dict[char] = 1
12 | else:
13 | freq_dict[char] += 1
14 |
15 | return freq_dict
16 |
17 | result = get_freq("Amogh loves to eat apple and mango. His sister also loves eating apple and mango")
18 | print(result)
19 | # {'Amogh': 1, 'loves': 2, 'to': 1, 'eat': 1, 'apple': 2, 'and': 2, 'mango.': 1, 'His': 1, 'sister': 1, 'also': 1, 'eating': 1, 'mango': 1}
20 |
21 |
22 |
23 | def is_prime(num):
24 | flag = 1
25 |
26 |     for i in range(2, int(num ** 0.5) + 1):  # range(2, num//2) misses cases like 4 (empty range)
27 | if num % i == 0:
28 | flag = 0
29 | break
30 |
31 | if flag == 0:
32 | print(f"{num} is not prime...")
33 | else:
34 | print(f"{num} is prime...")
35 |
36 | is_prime(7919) # 7919 is prime
37 |
38 |
39 | def fibo_iter(n_terms):
40 | first, second = 0, 1
41 | for i in range(0, n_terms):
42 | if i <= 1:
43 | result = i
44 | else:
45 | result = first + second
46 | first = second
47 | second = result
48 | print(result, end=' ')
49 |
50 | fibo_iter(5) # 0 1 1 2 3
51 |
52 | def fibo_recur(n):
53 | if n == 0:
54 | return 0
55 | elif n == 1:
56 | return 1
57 | else:
58 | return fibo_recur(n-1) + fibo_recur(n-2)
59 |
60 |
61 | for i in range(0, 10):
62 | print(fibo_recur(i), end=' ') # 0 1 1 2 3 5 8 13 21 34
63 |
--------------------------------------------------------------------------------
/bits_wilp/primeFactorization.py:
--------------------------------------------------------------------------------
1 | from math import sqrt
2 |
3 | def get_prime_factors(num):
4 | factors = []
5 |
6 | # get all the 2's
7 | while num % 2 == 0:
8 | factors.append(2)
9 |         num = num // 2   # integer division keeps num an int
10 | 
11 |     # check for other (odd) prime factors
12 |     # factors come in pairs, so checking up to sqrt(num) is enough
13 |     # step size of 2 to avoid checking even numbers
14 |     for i in range(3, int(sqrt(num))+1, 2):
15 |         while num % i == 0:
16 |             # print(num, i)
17 |             factors.append(i)
18 |             num = num // i
19 |
20 | # num is now the last prime number
21 | if num > 2:
22 | factors.append(int(num))
23 |
24 | return factors
25 |
26 |
27 | n = int(input("Enter the number: "))
28 | result = get_prime_factors(n)
29 |
30 | print("The factors of {n} are {result}".format(n=n, result=result))
31 |
32 | # Enter the number: 1081310109
33 | # The factors of 1081310109 are [3, 11, 17, 23, 181, 463]
34 |
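35 | # Sanity check (a hedged addition, assumes Python 3.8+ for math.prod):
36 | # multiplying the returned factors back together should give n
37 | from math import prod
38 | assert prod(result) == n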
--------------------------------------------------------------------------------
/bits_wilp/sample.txt:
--------------------------------------------------------------------------------
1 | What is Lorem Ipsum?
2 | Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
3 |
4 | Why do we use it?
5 | It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like).
6 |
7 |
8 | Where does it come from?
9 | Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.
10 |
11 | The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham.
12 |
--------------------------------------------------------------------------------
/bits_wilp/searching.py:
--------------------------------------------------------------------------------
1 |
2 | # Searching: Given a sorted array arr[] of n elements, write a
3 | # function to search a given element x in arr[]. Do it using linear
4 | # and binary search techniques.
5 |
6 | def linear_search(arr, elem):
7 | for i in range(0,len(arr)):
8 | if arr[i] == elem:
9 | print("{elem} found at {index} !".format(elem=elem, index=i))
10 | return
11 | print("{elem} not found in given sequence".format(elem=elem))
12 |
13 | def binary_search(arr, elem, l, r):
14 |     # base case: an empty search window means elem is absent
15 |     # (the original relied on catching RecursionError instead)
16 |     if l > r:
17 |         print("{elem} not found in given sequence".format(elem=elem))
18 |         return
19 | 
20 |     mid = (l + r) // 2
21 | 
22 |     if elem == arr[mid]:
23 |         print("{elem} found at {index} !".format(elem=elem, index=mid))
24 |         return
25 |     elif elem > arr[mid]:
26 |         binary_search(arr, elem, l=mid+1, r=r)
27 |     elif elem < arr[mid]:
28 |         binary_search(arr, elem, l=l, r=mid-1)
28 |
29 |
30 | arr = list(map(int, input("Enter numbers separated by space. Press ENTER when done: ").split()))
31 | arr.sort()
32 | print("Given sequence is :")
33 | print(arr)
34 |
35 | elem = int(input("Enter element to be searched: "))
36 | choice = int(input("Choose search method: \n 1. Linear Search \n 2. Binary Search \n"))
37 |
38 | if choice == 1:
39 | linear_search(arr, elem)
40 | elif choice == 2:
41 | binary_search(arr, elem, l=0, r=len(arr)-1)
42 | else:
43 | print("Error: Please enter a valid choice !")
44 |
--------------------------------------------------------------------------------
/bits_wilp/sumOfDigits.py:
--------------------------------------------------------------------------------
1 | def sumOfDigits(n):
2 | sum = 0
3 | while n > 0:
4 | rem = n % 10
5 | sum = sum + rem
6 | n = n // 10
7 | return sum
8 |
9 | print("Please enter a number: ")
10 | num = int(input())
11 | sod = sumOfDigits(num)
12 | print("The sum of digits for", num, "is", sod)
13 |
--------------------------------------------------------------------------------
/bits_wilp/topThreeFrequent.py:
--------------------------------------------------------------------------------
1 | filepath = './sample.txt'
2 | freq_counter = {}
3 |
4 | # using context manager
5 | with open(filepath, mode='r') as handle:
6 | content = handle.read()
7 | words = content.split()
8 | # file is closed now
9 |
10 | for w in words:
11 | if w not in freq_counter.keys():
12 | freq_counter[w] = 1
13 | else:
14 | freq_counter[w] += 1
15 |
16 | # sorting the sequence by values in `reverse` order
17 | sorted_by_freq = sorted(freq_counter, key=freq_counter.get, reverse=True)
18 |
19 | top_three = sorted_by_freq[:3]
20 | for k in top_three:
21 | print(k, ":", freq_counter[k])
22 |
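23 | # Hedged alternative using the standard library: collections.Counter does
24 | # the counting and the top-k selection in one call
25 | from collections import Counter
26 | for word, count in Counter(words).most_common(3):
27 |     print(word, ":", count)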
--------------------------------------------------------------------------------
/bresenham_line_algorithm.py:
--------------------------------------------------------------------------------
1 | # Bresenham Line Algorithm (BLA) is one of the earliest algorithms developed
2 | # in computer graphics. It is used for drawing lines. It is an efficient method
3 | # because it involves only integer addition and subtraction operations.
4 |
5 | # These operations can be performed very rapidly so lines can be generated quickly.
6 |
7 | # Reference: http://floppsie.comp.glam.ac.uk/Southwales/gaius/gametools/6.html
8 |
9 | # Algorithm (for a line with slope between 0 and 1):
10 | # 1. We are given the starting and ending points (x1, y1) and (x2, y2)
11 | # 2. We compute dx = x2 - x1 and dy = y2 - y1
12 | # 3. We initialize the decision parameter: slope = 2*dy - dx
13 | # 4. At every step, x advances by 1 and the decision parameter grows by 2*dy
14 | # 5. Whenever the decision parameter becomes non-negative, y advances by 1 and the parameter shrinks by 2*dx
15 | # 6. We continue this cycle until we reach (x2, y2)
16 |
17 | def lineGenerator(x1, y1, x2, y2):
18 |     dx = x2 - x1
19 |     dy = y2 - y1
20 | 
21 |     # initial decision parameter
22 |     slope = 2*dy - dx
23 | 
24 |     x = x1
25 |     y = y1
26 |     # use <= so that the end point (x2, y2) is also plotted
27 |     while x <= x2:
28 | 
29 |         # print current coordinates
30 |         print(x, y)
31 | 
32 |         # x increases every step
33 |         x += 1
34 | 
35 |         # 2*dy is always added to the decision parameter
36 |         slope += 2*dy
37 |         # when the decision parameter becomes non-negative,
38 |         # step up in y and pull the parameter back down
39 |         if slope >= 0:
40 |             y += 1
41 |             slope -= 2 * (x2 - x1)
42 |         # otherwise y stays the same and no adjustment is needed
43 | 
44 |
45 | lineGenerator(3, 2, 15, 5)  # sample run: prints the raster points from (3, 2) to (15, 5)
46 |
67 |
--------------------------------------------------------------------------------
/bst_nodes_in_range.py:
--------------------------------------------------------------------------------
1 | # Problem: Find the no. of nodes in a BST that lies in a given range
2 |
3 | # Algorithm: We will traverse the tree recursively until we encounter leaf nodes (Base case)
4 | # else we do the following
5 | # 1. Current node is less than the given range --> Traverse the right subtree
6 | # 2. Current node is more than the given range --> Traverse the left subtree
7 | # 3. Current node lies in the given range --> Increment Count; Traverse both the left and right subtree
8 |
9 | # Data Structure for Tree Node
10 | class Node:
11 | def __init__(self, data):
12 | self.data = data
13 | self.left = None
14 | self.right = None
15 |
16 |
17 | def nodesWithinRange(root, bounds):
18 |     low, high = bounds
19 |     # this is the base case
20 |     if root is None:
21 |         return 0
22 |     # if the current node lies in the range, count it and recurse on
23 |     # both sides; a node equal to low or high can still have in-range
24 |     # descendants, so short-circuiting on boundary equality (as the old
25 |     # "optional" branch did) would undercount
26 |     elif low <= root.data <= high:
27 |         return (
28 |             1 + nodesWithinRange(root.left, bounds) + nodesWithinRange(root.right, bounds)
29 |         )
30 |     # if the whole range lies in the left subtree
31 |     elif root.data > high:
32 |         return nodesWithinRange(root.left, bounds)
33 |     # if the whole range lies in the right subtree
34 |     elif root.data < low:
35 |         return nodesWithinRange(root.right, bounds)
36 |
37 | if __name__ == "__main__":
38 |
39 | node = Node(10)
40 | node.left = Node(5)
41 | node.left.left = Node(1)
42 | node.right = Node(50)
43 | node.right.left = Node(45)
44 | node.right.right = Node(100)
45 |
46 | result = nodesWithinRange(node, (5, 45))
47 | print(result)
48 |
--------------------------------------------------------------------------------
/bubble_sort.py:
--------------------------------------------------------------------------------
1 | # A simple implementation of bubble sort
2 |
3 | def bubbleSort(arr):
4 |     # traverse the whole array
5 | for i in range(len(arr)):
6 |
7 | # last i elements are already in place
8 | for j in range(0, len(arr)-i-1):
9 |
10 | if arr[j] > arr[j+1]:
11 | arr[j], arr[j+1] = arr[j+1], arr[j]
12 |
13 | return arr
14 |
15 | # Approach 2: This algorithm will run for O(n^2) even if the array is
16 | # already sorted. For avoiding this, we can check if elements are swapped
17 | # in each pass. We will break the loop in case they are not
18 |
19 | # Time Complexity: O(n^2) - Average or Worst Case; O(n) - Best case [Array is already sorted]
20 |
21 | def bubbleSortOptimized(arr):
22 | for i in range(len(arr)):
23 | swapped = False
24 |
25 | for j in range(0, len(arr)-i-1):
26 |
27 | if arr[j] > arr[j+1]:
28 | arr[j], arr[j+1] = arr[j+1], arr[j]
29 | swapped = True
30 |
31 | # if no elements are swapped, break the loop
32 |         if not swapped:
33 | break
34 |
35 | return arr
36 |
37 | if __name__ == "__main__":
38 | arr = [2, 6, 1, 5, 3, 4]
39 | res = bubbleSort(arr)
40 | print(res)
41 |
--------------------------------------------------------------------------------
/calculateClockAngle.py:
--------------------------------------------------------------------------------
1 | # Find the angle made by the hour hand and the minute
2 | # hand at any given time. Assume it is an analog clock
3 |
4 | def calculateAngle(hour, minute):
5 | if hour < 0 or minute < 0 or hour > 12 or minute > 60:
6 | print("Wrong inputs given...")
7 | return
8 | else:
9 |
10 | if hour == 12:
11 | hour = 0
12 | if minute == 60:
13 | minute = 0
14 |
15 |         # hour hand moves 360° in 12 hours i.e.
16 |         # 360/(12*60) ==> 0.5° every minute
17 |
18 | # similarly minute hand moves 360° in 1 hour i.e.
19 | # 360/60 ==> 6° every minute
20 |
21 | hour_angle = (hour * 60 + minute) * 0.5
22 | minute_angle = minute * 6
23 |
24 | # We take the absolute difference
25 | # and then return the acute angle between the two
26 | difference = abs(hour_angle - minute_angle)
27 |
28 | return min(difference, 360 - difference)
29 |
30 | input_time = (9, 30)
31 | print("The angle between hour and minute hand is: ", calculateAngle(input_time[0], input_time[1]), '\u00b0')
32 |
--------------------------------------------------------------------------------
/check_anagrams.py:
--------------------------------------------------------------------------------
1 | # Problem: Two strings of sizes m and n are given,
2 | # we have to find how many characters need to be
3 | # removed from both the string so that they become
4 | # anagrams of each other
5 |
6 | # Anagrams: Words that are made from rearranging the letters of another word
7 |
8 | # Algorithm: We will use dictionaries to keep track of characters.
9 | # The idea is to count the commonly occurring characters and derive
10 | # the uncommon characters from that count
11 |
12 | import string
13 |
14 | # letters will be a string of the form "abc...xyz"
15 | # CHARACTER_HASH looks like this {'a': 0, 'b': 0, ..., 'z': 0}
16 | letters = string.ascii_lowercase
17 | CHARACTER_HASH = dict(zip(letters, [0] * len(letters)))
18 |
19 |
20 | # This method will mark all the letters occurring in 'text_a'
21 | def mapLettersToHash(text_a):
22 | for char in text_a:
23 | if char in CHARACTER_HASH.keys():
24 | CHARACTER_HASH[char] += 1
25 |
26 |
27 | # This method counts the letters of 'text_b' that are also found in 'text_a';
28 | # decrementing the hash keeps repeated letters from being counted twice
29 | def computeCommonLetters(text_b):
30 |     common_letters = 0
31 |     for char in text_b:
32 |         if CHARACTER_HASH.get(char, 0) > 0:
33 |             CHARACTER_HASH[char] -= 1
34 |             common_letters += 1
35 |     return common_letters
35 |
36 |
37 | # Now we derive how many uncommon letters are present,
38 | # This is done by subtracting twice the count of common letters
39 | # from the total length of both the strings
40 | def computeUncommonLetters(text_a, text_b, common_letters):
41 | return abs(len(text_a) + len(text_b) - (2 * common_letters))
42 |
43 | if __name__ == "__main__":
44 | text_1 = "hello"
45 | text_2 = "billion"
46 |
47 | mapLettersToHash(text_1)
48 | common = computeCommonLetters(text_2)
49 | result = computeUncommonLetters(text_1, text_2, common)
50 | print(result)
51 |
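52 | # A compact alternative sketch using collections.Counter (not the original
53 | # approach): the characters to remove are exactly the symmetric difference
54 | # of the two letter multisets
55 | from collections import Counter
56 | 
57 | def removals_to_make_anagrams(a, b):
58 |     c1, c2 = Counter(a), Counter(b)
59 |     return sum(((c1 - c2) + (c2 - c1)).values())
60 | 
61 | print(removals_to_make_anagrams("hello", "billion"))  # 6, same as above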
--------------------------------------------------------------------------------
/check_semiprime.py:
--------------------------------------------------------------------------------
1 | # Context: A semiprime is a product of two prime
2 | # numbers, not necessarily distinct.
3 | # Squares of prime numbers are also semiprimes.
4 |
5 | # Problem: Find the numbers which are semiprimes,
6 | # within a given range. For e.g. 1 to 100.
7 |
8 |
9 | def isSemiprime(num):
10 | # start with the smallest prime
11 | prime = 2
12 | # initialize counter to 0
13 | count = 0
14 | # Design of while loop:
15 | # 1. if count exceeds 2, it is not a semiprime, e.g. 30 = 2*3*5
16 | # 2. when the number becomes 1, we have found the second prime
17 | while count < 3 and num != 1:
18 | # if the number is divisible by current prime,
19 | # increment count, else move to new prime
20 | if not (num % prime):
21 |             num = num // prime  # integer division keeps num exact
22 | count = count + 1
23 | else:
24 | prime = prime + 1
25 | # if count is two, given number is a semiprime
26 | return count == 2
27 |
28 |
29 | for i in range(1, 100):
30 | if isSemiprime(i):
31 | print(i, end=" ")
32 |
33 | # Result: 4 6 9 10 14 15 21 22 25 26 33 34 35 38 39 46 49
34 | # 51 55 57 58 62 65 69 74 77 82 85 86 87 91 93 94 95
35 |
--------------------------------------------------------------------------------
/data_science_interviews.md:
--------------------------------------------------------------------------------
1 | What is PEP 8 and why is it important?
2 |
3 | What is Scope in Python?
4 |
5 | What are lists and tuples? What is the key difference between the two?
6 |
7 | What are modules and packages in Python?
8 |
9 | What is self in Python?
10 |
11 | What are decorators in Python?
12 |
13 | What is lambda in Python? Why is it used?
14 |
15 | What are generators in Python?
16 |
17 | Can you create a series from the dictionary object in pandas?
18 |
19 | How will you delete indices, rows, and columns from a data frame?
20 |
21 | Can you get items of series A that are not available in another series B?
22 |
23 | How are NumPy arrays advantageous over python lists?
24 |
25 | Write python function which takes a variable number of arguments.
26 |
27 | WAP (Write a program) which takes a sequence of numbers and checks if all numbers are unique.
28 |
29 | ****************************************************************************
30 |
31 | How do we use Eigenvalues and eigenvectors in PCA (Principal Components Analysis) ?
32 |
33 | Difference between exogenous and auto regression in time series forecasting.
34 |
35 | Difference between normalization and standardization, will it be used before train test split or after?
36 |
37 | How do you reduce the impact of one feature relative to the others?
38 |
39 | Difference between XGBoost and FBProphet.
40 |
41 | Describe a scenario where you would not make the data stationary in a time series forecasting problem.
42 |
43 | BERT is trained on which dataset? What model will be used if BERT does not exist? Describe self- attention mechanism.
44 |
45 | Difference between univariate and multivariate time series forecasting problems.
46 | ****************************************************************************
47 |
48 | Find the middle node of a given LinkedList.
49 | `Used two pointer approach`
50 | `Slow pointer = node.next`, and
51 | `Fast pointer = node.next.next;`
52 | at each iteration, check whether either pointer equals `null`.
53 | When the fast pointer is null, the slow pointer will be at the middle node; just print `node.data` to get the result (a runnable sketch is at the end of this file).
54 |
55 |
56 | Print all the permutations of a given string.
57 | There are two approaches for this: either
58 | we can use `itertools.permutations` from the standard library, or
59 | we can code a recursive/backtracking solution; since a string of length `n` has `n!` permutations, any approach takes at least `O(n * n!)` time.
60 |
61 |
62 | Third-to-last node of a LinkedList:
63 | the two pointer approach mentioned above can be used here as well (start the fast pointer three nodes ahead).
64 |
65 | Difference between `call by value` and `call by reference`.
66 | In call by value, we pass a copy of the variable into the function, whereas
67 | in call by reference we pass the actual variable into the function.
68 | How do we do that? We pass the memory address of that variable to the function.
69 | These concepts are used with pointers in C/C++.
70 |
71 | Difference between `==` and `===` in JavaScript.
72 | Both are used for comparison:
73 | double equals compares values only, whereas
74 | triple equals compares both the values and the data types of LHS & RHS.
75 |
76 | Difference between Breadth-first search & Depth first search.
77 |
78 |
79 | ****************************************************************************
80 | Explanation of the past project. What were the features used and how did you determine performance?
81 |
82 | What is the difference between linear regression and logistic regression?
83 |
84 | What is the internal working of logistic regression (LR)?
85 |
86 | What is the loss function of LR?
87 |
88 | Name some hyperparameters used in LR? Why do we use regularization?
89 |
90 | When do we use accuracy as a metric? When should we not use accuracy?
91 |
92 | How do you deal with imbalanced data?
93 |
94 | What is SMOTE and how is it different from stratified sampling?
95 |
96 | Watch this video to understand how SMOTE works [https://www.youtube.com/watch?v=U3X98xZ4_no]
97 |
98 | What is better 0.51 AUC (Area Under the Curve) or 0.43 F1 score? Which one should you present to a client?
99 |
100 | Watch this video to understand how AUC is interpreted [https://www.youtube.com/watch?v=mUMd_cKU0VM]
101 |
102 | What does the ROC AUC value signify?
103 |
104 | Do we only use the threshold of 0.5 or can we use other thresholds in LR? If yes, how do we find them?
105 |
106 | Can I use a sales forecasting model built using pencils data to be used in erasers data?
107 |
108 | How would you compare the performance of two forecasting models?
109 |
110 | What are the different metrics used in regression analysis? Which metric should be used where?
111 |
112 | How do you build a testing pipeline for a data science model? [https://www.kdnuggets.com/2020/08/unit-test-data-pipeline-thank-yourself-later.html]
113 |
114 | ****************************************************************************
115 |
116 | How does Iterators and generators work in Python ?
117 |
118 | What does Python constructors do and how are they useful ?
119 |
120 | Explain what Map function does in Python ?
121 |
122 | How do you flatten an image(matrix) in a deep learning architecture ?
123 |
124 | Difference between semantic segmentation and instance segmentation ?
125 |
126 | Which are the different types of pooling operations - what is the visual effect of applying a max pooling operation and average pooling operation on an image ?
127 |
128 | What is the math behind convolution operation – what will be the size of a particular image (128*128) after convolution operation with a 3*3 kernel ?
129 |
130 | What will be the size of a particular image (128*128) after a convolution operation with a 1*1 kernel ?
131 |
132 | What is the Loss function and optimization function of region proposal network ?
133 |
134 | What is Image down sampling – why do we do down sampling ?
135 |
136 | Python coding: Solve the following using a for loop, by defining a function and putting it inside a class
137 |
138 | `#Input : a =[1,2,3] `
139 |
140 | `#Output : ["hello1","hello2","hello3"]`
141 |
142 | Tradeoff between YOLO and FasterRCNN in terms of speed and accuracy ?
143 |
144 | What are feature maps and how are they obtained ?
145 |
146 | ****************************************************************************
147 | How will you count unique values in a data frame column.
148 |
149 | How will you convert a column data type to string ?
150 |
151 | How will you obtain correlation coefficient between 2 columns in a data frame ?
152 |
153 | How will you merge two data frames based on a common column (when the column name is the same) ?
154 |
155 | How will you merge two data frames based on a common column (when the column name is different in the left and right data frames) ?
156 |
157 | Define the term correlation with respect to statistics ?
158 |
159 | What are the types of correlation coefficient?
160 |
161 | What is the difference in Pearson correlation coefficient and spearmen correlation coefficient?
162 |
163 | How do we deal with categorical variables for statistical analysis?
164 |
165 | How do you obtain correlation between 2 categorical variables?
166 | How do you find Correlation between one categorical variable and other numerical variables?
167 |
168 | What is the difference between dictionary and list?
169 |
170 | How do you append a dictionary with another dictionary?
171 |
172 | What is the difference between tuples and list ?
173 |
174 | Can a tuple have different data types of element contained within it ?
175 |
176 | How do you read data from database directly and convert it into data frame for analysis?
177 |
178 | How do you import file.py function into another python file ?
179 |
180 | What are generators in python ?
181 |
182 | How will you print index and values of a list without range function ?
183 |
184 | ****************************************************************************
185 |
186 | What is the difference between Docker and Containers?
187 |
188 | How do you restart containers on failure?
189 |
190 | How do you run a container in Docker?
191 |
192 | Can you run a program that takes 4 hours to run in AWS Lambda?
193 |
194 | What is the difference between ADD and COPY commands wrt. Dockerfile ?
195 |
196 | Experience with different AWS services such as CloudFormation or Glue?
197 |
198 | What is the schema in S3?
199 |
200 | Can the lambda written in AWS interact with other infrastructure?
201 |
202 | What is the Dockerfile setup if you want to expose the model as an API?
203 |
204 | Difference between UDF, pandas UDF and pyspark UDFs?
205 |
206 | Difference between synchronous and asynchronous request? How do you program one in Python?
207 |
208 | What is the use of a DAG (Directed Acyclic Graph) in Spark?
209 |
210 | Given the no. of terms, print the Fibonacci sequence: Hint: try both iterative and recursive methods [https://www.programiz.com/python-programming/examples/fibonacci-sequence]
211 |
212 | Given an input string, print the length of the longest common substring without any repeating characters. [https://leetcode.com/problems/longest-substring-without-repeating-characters/]
213 |
214 | Given an input string, write a function that returns the Run Length Encoded string for the input string. For example, if the input string is “ssslbbbbppiitttc”, then the function should return “s3l1b4p2i2t3c1”
215 |
216 | ****************************************************************************
217 |
218 | Given a list, `ls = [9,8,3,4,1,0,2,7,7,6]`, write a function to get nth highest element without using any inbuilt functions or sorting.
219 |
220 | Write a python class with method to sort a list and related questions on classes, static methods, init etc.
221 |
222 | Difference between `RANK` and `DENSE RANK`?
223 |
224 | Difference between `parquet` and `csv` file format? How are files written in a parquet file?
225 |
226 | What is Cursor command in SQL?
227 |
228 | Difference between Spark vs MapReduce architecture?
229 |
230 | Explanation of ETL pipeline
231 |
232 | Containerization v/s virtualization
233 |
234 | What is port redirection in docker?
235 |
236 | How to create a table with Databricks storage?
237 |
238 | Difference between SQL and NoSQL DB?
239 |
240 | A scenario where data keeps on changing, with adding and updating new features , would you consider SQL or NoSQL?
241 |
242 | What is the difference between iterators and generators
243 |
244 | What is the difference between OLAP and OLTP?
245 |
246 | ****************************************************************************
247 |
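248 | A minimal sketch of the two-pointer middle-node approach referenced above (assuming a simple `Node` class with `data` and `next` attributes):
249 | 
250 | ```python
251 | def middle_node(head):
252 |     # slow advances one step per iteration, fast advances two;
253 |     # when fast runs off the end, slow sits at the middle node
254 |     slow = fast = head
255 |     while fast is not None and fast.next is not None:
256 |         slow = slow.next
257 |         fast = fast.next.next
258 |     return slow.data if slow is not None else None
259 | ```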
--------------------------------------------------------------------------------
/dfs_bfs.py:
--------------------------------------------------------------------------------
1 | # Given a graph, there are two methods to
2 | # perform traversal on it.
3 | # 1. Depth First Search (DFS)
4 | # 2. Breadth First Search (BFS)
5 |
6 | # Breadth First Search:
7 | # We check the adjacent nodes first, mark them visited, and then explore their adjacent nodes.
8 | # This uses a queue to keep track of the nodes to visit, in FIFO style.
9 | # In BFS, one vertex is selected and marked as visited at a time; then its unvisited neighbours are stored in the queue.
10 | # The goal is to get the shortest path by traversing the minimum no. of edges in the graph.
11 | # BFS expands all partial paths level by level, so the first time it reaches a node it has used the fewest possible edges.
12 | # It searches for the solution among the nearest nodes first,
13 | # hence it is useful for social networks where in-depth exploration is not required.
14 | 
15 | # Depth First Search:
16 | # We follow the first path we discover and go deep until we encounter a dead end.
17 | # This uses a stack to keep track of the visited nodes and performs `backtracking` when a dead end is met.
18 | # DFS is often faster than BFS when exploration, rather than the shortest path, is the priority.
19 | # DFS will always find a path if one exists, but unlike BFS it may not be the shortest path.
20 |
21 | def dfs_1(graph, start):
22 | visited, stack = set(), [start]
23 | while stack:
24 | vertex = stack.pop()
25 | if vertex not in visited:
26 | visited.add(vertex)
27 | stack.extend(graph[vertex] - visited)
28 | return visited
29 |
30 |
31 | def dfs_2(graph, start, visited=None):
32 | if visited is None:
33 | visited = set()
34 | visited.add(start)
35 | for next in graph[start] - visited:
36 | dfs_2(graph, next, visited)
37 | return visited
38 |
39 |
40 | def bfs(graph, start):
41 | visited, queue = set(), [start]
42 | while queue:
43 | vertex = queue.pop(0)
44 | if vertex not in visited:
45 | visited.add(vertex)
46 | queue.extend(graph[vertex] - visited)
47 | return visited
48 |
49 |
50 | # bfs(graph, 'A') # {'B', 'C', 'A', 'F', 'D', 'E'}
51 |
52 |
53 | def dfs_paths(graph, start, goal):
54 | stack = [(start, [start])]
55 | while stack:
56 | (vertex, path) = stack.pop()
57 | for next in graph[vertex] - set(path):
58 | if next == goal:
59 | yield path + [next]
60 | else:
61 | stack.append((next, path + [next]))
62 |
63 |
64 | graph = {
65 | "A": set(["B", "C"]),
66 | "B": set(["A", "D", "E"]),
67 | "C": set(["A", "F"]),
68 | "D": set(["B"]),
69 | "E": set(["B", "F"]),
70 | "F": set(["C", "E"]),
71 | }
72 |
73 | result = list(dfs_paths(graph, "A", "F")) # [['A', 'C', 'F'], ['A', 'B', 'E', 'F']]
74 | print(result)
75 |
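76 | # Note: list.pop(0) above is O(n). A hedged variant using collections.deque
77 | # gives O(1) pops from the left (same traversal, faster queue):
78 | from collections import deque
79 | 
80 | def bfs_deque(graph, start):
81 |     visited, queue = set(), deque([start])
82 |     while queue:
83 |         vertex = queue.popleft()
84 |         if vertex not in visited:
85 |             visited.add(vertex)
86 |             queue.extend(graph[vertex] - visited)
87 |     return visited
88 | 
89 | print(bfs_deque(graph, "A"))  # same vertices as bfs(graph, "A")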
--------------------------------------------------------------------------------
/diameterOfTree.py:
--------------------------------------------------------------------------------
1 | # The diameter of a tree (sometimes called the width)
2 | # is the number of nodes on the longest path between
3 | # any two leaf nodes.
4 |
5 | # The diameter of a tree T is the largest of the following quantities:
6 | # the diameter of T’s left subtree
7 | # the diameter of T’s right subtree
8 | # the longest path between leaves that goes through the root of T (this can be computed from the heights of the subtrees of T)
9 |
10 | # Algorithm:
11 | # Case 1: if the diameter passes through origin, then diameter is
12 | # height of left subtree + height of right subtree + 1 (root node)
13 | # d = lheight + rheight + 1
14 |
15 | # Case 2: if the diameter is not passing through origin
16 | # Search for diameter in the left subtree and right subtree
17 | # Pick the larger value of the two subtrees
18 | # d = max(ldiameter, rdiameter)
19 |
20 | # Finally take max of the two values since we do not
21 | # know if diameter is passing through the root or not
22 | # d = max(lheight + rheight + 1, max(ldiameter, rdiameter))
23 |
24 | class Node(object):
25 | def __init__(self, data):
26 | self.data = data
27 | self.left = None
28 | self.right = None
29 |
30 | def height(tree):
31 | if tree is None:
32 | return 0
33 | else:
34 | return 1 + max(height(tree.left), height(tree.right))
35 |
36 | def diameter(tree):
37 | if tree is None:
38 | return 0
39 |
40 | else:
41 | lheight = height(tree.left)
42 | rheight = height(tree.right)
43 |
44 | ldiameter = diameter(tree.left)
45 | rdiameter = diameter(tree.right)
46 |
47 | return max(rheight + lheight + 1, max(ldiameter, rdiameter))
48 |
49 | root = Node(1)
50 | root.left = Node(2)
51 | root.right = Node(3)
52 | root.left.left = Node(4)
53 | root.left.right = Node(5)
54 | print("Diameter of given binary tree is ",diameter(root))
55 |
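56 | # The version above recomputes subtree heights inside diameter(), which is
57 | # O(n^2). A hedged single-pass sketch computes height and diameter together
58 | # in O(n):
59 | def diameter_linear(tree):
60 |     best = 0
61 | 
62 |     def height(node):
63 |         nonlocal best
64 |         if node is None:
65 |             return 0
66 |         lheight = height(node.left)
67 |         rheight = height(node.right)
68 |         # longest node-counted path passing through this node
69 |         best = max(best, lheight + rheight + 1)
70 |         return 1 + max(lheight, rheight)
71 | 
72 |     height(tree)
73 |     return best
74 | 
75 | print("Diameter (single pass):", diameter_linear(root))  # matches diameter(root)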
--------------------------------------------------------------------------------
/estimate_pi.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from math import pi as PI
3 |
4 | def estimate_pi(sims):
5 | """
6 | takes the number of simulations as input to estimate pi
7 | """
8 |
9 | # counter to hold points lying inside the circle
10 | in_circle = 0
11 |
12 | for s in range(0,sims):
13 |
14 | x = np.random.rand()
15 | y = np.random.rand()
16 |
17 | if (x**2 + y**2) <= 1:
18 | in_circle += 1
19 |
20 |     # The ratio of pts. inside the circle to the total pts. will be the same as the ratio
21 |     # of the area of the circle to the area of the square inside which the circle is inscribed
22 | # Area of circle = PI * R * R
23 | # Area of square = (2R) * (2R)
24 |
25 | pi_estimated = 4.0 * in_circle / sims
26 |
27 | print("Simulations ran: ", sims)
28 | print("Estimated pi", pi_estimated)
29 | print("Error", PI - pi_estimated)
30 |
31 | pow = 0
32 | input_sims = 100
33 | while pow <= 8:
34 | estimate_pi(sims=input_sims)
35 | pow += 1
36 | input_sims *= 10
37 |
38 |
--------------------------------------------------------------------------------
/find_k_largest.py:
--------------------------------------------------------------------------------
1 | # Write an efficient program for
2 | # printing k largest elements in
3 | # an array. Elements in array can
4 | # be in any order.
5 | # Time Complexity: O(NlogN) + O(k)
6 |
7 | def findKLargest(arr, k):
8 | arr.sort(reverse=True)
9 | for i in range(0, k):
10 | print(arr[i], end=" ")
11 |
12 | arr = [1, 23, 12, 9, 30, 2, 50]
13 | k = 3
14 | findKLargest(arr, k)
15 |
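16 | # Hedged alternative using the standard library: heapq.nlargest keeps a
17 | # heap of size k, giving O(N log k) instead of sorting the whole array
18 | import heapq
19 | 
20 | print()
21 | print(*heapq.nlargest(k, arr))  # 50 30 23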
--------------------------------------------------------------------------------
/find_m_to_last_llist.py:
--------------------------------------------------------------------------------
1 | # Given a linked list, this method
2 | # will return m'th element to the last
3 | # 2->3->4->8->5; m=2 will return 8
4 | # since 8 is second to last
5 |
6 | from linked_list_data_structure import LinkedList
7 |
8 |
9 | def findMToLast(l_list, m):
10 | current = l_list.head
11 | count = 0
12 |
13 | while current is not None and count < m:
14 | count += 1
15 | current = current.getNextNode()
16 |
17 | m_behind = l_list.head
18 | while current.next_node is not None:
19 | current = current.getNextNode()
20 | m_behind = m_behind.getNextNode()
21 |
22 | return m_behind
23 |
24 |
25 | linked_list = LinkedList()
26 | m_to_last = 3
27 | # Returns the third element from last
28 | print(findMToLast(linked_list, m_to_last))
29 |
--------------------------------------------------------------------------------
/find_pairs_sum_k.py:
--------------------------------------------------------------------------------
1 | # Given an array of numbers, find all the
2 | # pairs of numbers which sum upto `k`
3 |
4 |
5 | def find_pairs(num_array, k):
6 | pairs_array = []
7 | for num in num_array:
8 | if (k - num) in num_array:
9 | pairs_array.append((num, (k - num)))
10 | return pairs_array
11 |
12 |
13 | result = find_pairs([0, 14, 0, 4, 7, 8, 3, 5, 7], 11)
14 | print(result)
15 |
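16 | # A hedged set-based sketch (not the original approach): O(n) time, and it
17 | # avoids the duplicate/mirrored pairs the membership scan above reports
18 | def find_pairs_unique(num_array, k):
19 |     seen = set()
20 |     pairs = set()
21 |     for num in num_array:
22 |         # a previously seen complement completes a pair
23 |         if k - num in seen:
24 |             pairs.add((min(num, k - num), max(num, k - num)))
25 |         seen.add(num)
26 |     return pairs
27 | 
28 | print(find_pairs_unique([0, 14, 0, 4, 7, 8, 3, 5, 7], 11))  # {(3, 8), (4, 7)}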
--------------------------------------------------------------------------------
/find_products_pair_k.py:
--------------------------------------------------------------------------------
1 | def product_pair(arr, x):
2 | arr_sorted = sorted(arr)
3 |
4 | for i in range(0, len(arr_sorted)):
5 |         sub_array = arr_sorted[i + 1 :]
6 |         # only an exact, non-zero divisor of x can complete the product pair
7 |         if arr_sorted[i] != 0 and x % arr_sorted[i] == 0 and x // arr_sorted[i] in sub_array:
8 |             return True
8 |
9 | return False
10 |
11 |
12 | arr = [10, 20, 9, 40]
13 | x = 400
14 |
15 | res = product_pair(arr, x)
16 | print(res)
17 |
--------------------------------------------------------------------------------
/find_pythagoras_triplet.py:
--------------------------------------------------------------------------------
1 | # Given an array of integers, write a function that
2 | # returns true if there is a triplet (a, b, c) that
3 | # satisfies a^2 + b^2 = c^2.
4 |
5 | def findPythagorasTriplet(arr, n):
6 | # convert the array to squares
7 | for i in range(0, n):
8 | arr[i] = arr[i] * arr[i]
9 |
10 | # sort the array
11 | arr.sort()
12 |
13 | # use meet in the middle to find the pair (a,b)
14 | for i in range(n-1, 1, -1):
15 | j = 0
16 | k = i-1
17 |
18 | while (j < k):
19 | # a pair is found
20 | if (arr[j] + arr[k]) == arr[i]:
21 | return True
22 | else:
23 | if (arr[j] + arr[k]) < arr[i]:
24 | j = j + 1
25 | else:
26 | k = k - 1
27 |
28 | return False
29 |
30 | ar = [3, 1, 4, 6, 5]
31 | ar_size = len(ar)
32 | if(findPythagorasTriplet(ar, ar_size)):
33 | print("Yes")
34 | else:
35 | print("No")
36 |
--------------------------------------------------------------------------------
/find_second_largest_in_binary_tree.py:
--------------------------------------------------------------------------------
1 | # Given a binary tree, find the second largest
2 | # node in it
3 |
4 |
5 | class Node:
6 | def __init__(self, data):
7 | self.data = data
8 | self.left = None
9 | self.right = None
10 |
11 |
12 | def find_largest(root):
13 |     current = root
14 |     # keep walking right; the rightmost node holds the largest value
15 |     while current.right is not None:
16 |         current = current.right
17 |     return current.data
18 |
19 |
20 | def find_second_largest(root):
21 | if root is None or (root.left is None and root.right is None):
22 |         raise ValueError("Tree must have at least 2 nodes")
23 |
24 | current = root
25 |
26 | while current is not None:
27 | if current.left is not None and current.right is None:
28 | return find_largest(current.left)
29 |
30 | if (
31 | current.right is not None
32 | and current.right.left is None
33 | and current.right.right is None
34 | ):
35 | return current.data
36 |
37 | current = current.right
38 |
39 |
40 | node = Node(10)
41 | node.left = Node(5)
42 | node.left.left = Node(1)
43 | node.right = Node(50)
44 | node.right.left = Node(45)
45 | node.right.right = Node(100)
46 |
47 | result = find_second_largest(node)
48 | print(result) # prints 50
49 |
--------------------------------------------------------------------------------
/first_n_fibo.py:
--------------------------------------------------------------------------------
1 | # Write a function that computes the
2 | # list of the first n Fibonacci numbers
3 |
4 | FIB_ARR = [0, 1]
5 |
6 | def first_n_fibo(n):
7 |     if n < 2:
8 |         return FIB_ARR[:n]
9 |     else:
10 |         while len(FIB_ARR) < n:
11 |             FIB_ARR.append(FIB_ARR[-1] + FIB_ARR[-2])
12 |         return FIB_ARR[:n]
13 |
14 |
15 | n = 10
16 | arr = first_n_fibo(n)
17 | print(arr)
18 |
--------------------------------------------------------------------------------
/first_non_repeating.py:
--------------------------------------------------------------------------------
1 | # Given an input string, it gives the
2 | # first non repeating character in it
3 | # There are two implementations below
4 | # 1. dict-based: counts in one pass, O(n) time
5 | # 2. list-based: simpler bookkeeping, but O(n^2) worst-case time
6 |
7 |
8 | def first_non_repeating(input_string):
9 | frequency = dict()
10 | flag = None
11 |
12 | for char in input_string:
13 | if char in frequency.keys():
14 | frequency[char] += 1
15 | else:
16 | frequency[char] = 0
17 |
18 | for char in input_string:
19 | if frequency[char] == 0:
20 | flag = char
21 | break
22 |
23 | return flag
24 |
25 |
26 | # list membership tests make this version
27 | # O(n^2) in the worst case; it is kept here
28 | # for comparison with the dict-based one
29 | def first_non_repeating_v2(input_string):
30 |
31 | flag = None
32 | repeating = []
33 | non_repeating = []
34 |
35 | for char in input_string:
36 | if char in non_repeating:
37 | non_repeating.remove(char)
38 | repeating.append(char)
39 | else:
40 | non_repeating.append(char)
41 |
42 | if len(non_repeating) == 0:
43 | pass
44 | else:
45 | flag = non_repeating[0]
46 |
47 | return flag
48 |
49 |
50 | result = first_non_repeating("djebdedbekfrnkfnduwbdwkd")
51 | print(result) # j
52 |
53 | result = first_non_repeating("aabbcc")
54 | print(result) # None
55 |
--------------------------------------------------------------------------------
/first_recurring_character.py:
--------------------------------------------------------------------------------
1 | # Given an input string, find the first
2 | # recurring character in it.
3 |
4 |
5 | def first_recurring_character(input):
6 | flag = None
7 | d = dict()
8 | for char in input:
9 | if char in d.keys():
10 | flag = char
11 | return flag
12 | d[char] = 1
13 | return flag
14 |
15 |
16 | result = first_recurring_character("DFGHJWERGBFGHJ")
17 | print(result) # G
18 | result = first_recurring_character("ABCDEFGH")
19 | print(result) # None
20 | result = first_recurring_character("12345642124345")
21 | print(result) # 4
22 |
--------------------------------------------------------------------------------
/first_unique_letter.py:
--------------------------------------------------------------------------------
1 | # Problem: Given a string, find the first non-repeating
2 | # character in it. For example, if the input string is
3 | # “GeeksforGeeks”, then output should be ‘f’ and if input
4 | # string is “GeeksQuiz”, then output should be ‘G’.
5 |
6 | import string
7 |
8 | letters = string.ascii_lowercase
9 | CHARACTER_HASH = dict(zip(letters, [0] * len(letters)))
10 |
11 |
12 | def mapLettersToHash(text_a):
13 | for char in text_a:
14 | if char in CHARACTER_HASH.keys():
15 | CHARACTER_HASH[char] += 1
16 |
17 |
18 | def getFirstUniqueLetter(text_a):
19 | for char in text_a:
20 | if CHARACTER_HASH[char] == 1:
21 | return char
22 |
23 |
24 | text_1 = "geeksquiz"
25 |
26 | mapLettersToHash(text_1)
27 | result = getFirstUniqueLetter(text_1)
28 | print(result)
29 |
--------------------------------------------------------------------------------
/gamblers_ruin.py:
--------------------------------------------------------------------------------
1 | """
2 | David vs. Goliath Gambler's Ruin Simulation
3 |
4 | This program simulates a gambling scenario between two players: David and Goliath.
5 | David has a skill advantage, represented by a 55% probability of winning each round,
6 | while Goliath has a size advantage with a larger initial amount of money.
7 |
8 | Assumptions:
9 | - David starts with $2,000, and Goliath starts with $10,000.
10 | - Each round of betting results in a transfer of $1,000 from the loser to the winner.
11 | - The game continues until one player runs out of money (i.e., their amount reaches zero).
12 | - The outcome of each round is determined by a random number generator, reflecting David's skill advantage.
13 |
14 | Mathematics:
15 | - The simulation models a stochastic process where each round can be viewed as an independent Bernoulli trial:
16 | - David wins with a probability of 0.55.
17 | - Goliath wins with a probability of 0.45.
18 | - The expected outcomes can be analyzed using concepts from probability theory and stochastic processes.
19 | - The simulation runs for a specified number of trials to gather statistical data on how often David wins compared to Goliath.
20 |
21 | Usage:
22 | 1. Run the program in a Python environment.
23 | 2. Input the desired number of simulations when prompted.
24 | 3. The program will output the number of wins for both David and Goliath and display a bar chart of the results.
25 |
26 | This simulation provides insights into how skill can offset size advantages in competitive scenarios.
27 | """
28 |
29 | import random
30 | import matplotlib.pyplot as plt
31 |
32 | def gambler_ruin(david_initial, goliath_initial, david_win_prob, simulations):
33 | results = []
34 |
35 | for _ in range(simulations):
36 | david_amount = david_initial
37 | goliath_amount = goliath_initial
38 |
39 | while david_amount > 0 and goliath_amount > 0:
40 | # Simulate a single bet based on David's winning probability
41 | if random.random() < david_win_prob: # David wins
42 | david_amount += 1000
43 | goliath_amount -= 1000
44 | else: # Goliath wins
45 | david_amount -= 1000
46 | goliath_amount += 1000
47 |
48 | # Record the result: True if David wins, False if Goliath wins
49 | results.append(david_amount > 0)
50 |
51 | return results
52 |
53 | def plot_results(results):
54 | wins = sum(results)
55 | losses = len(results) - wins
56 |
57 | plt.bar(['David Wins', 'Goliath Wins'], [wins, losses], color=['blue', 'red'])
58 | plt.title('David vs. Goliath Simulation Results')
59 | plt.ylabel('Number of Simulations')
60 | plt.show()
61 |
62 | def main():
63 | david_initial = 2000 # David's initial amount
64 | goliath_initial = 10000 # Goliath's initial amount
65 |     david_win_prob = 0.55        # David's skill advantage (55%)
66 | simulations = int(input("Enter number of simulations: "))
67 |
68 | results = gambler_ruin(david_initial, goliath_initial, david_win_prob, simulations)
69 |
70 | print(f"\nResults after {simulations} simulations:")
71 | print(f"David Wins: {sum(results)}")
72 | print(f"Goliath Wins: {len(results) - sum(results)}")
73 |
74 | plot_results(results)
75 |
76 | if __name__ == "__main__":
77 | main()
78 |
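79 | # Sanity-check sketch (not part of the original simulation): classic
80 | # gambler's-ruin theory gives a closed form for David's win chance.
81 | # With unit bets of $1,000, David holds a = 2 of a + b = 12 total
82 | # units, and q = 1 - p:
83 | #     P(David wins) = (1 - (q/p)**a) / (1 - (q/p)**(a + b))
84 | def theoretical_win_probability(p=0.55, units_david=2, units_total=12):
85 |     q_over_p = (1 - p) / p
86 |     return (1 - q_over_p ** units_david) / (1 - q_over_p ** units_total)
87 | 
88 | # print(theoretical_win_probability())  # ~0.363, to compare with the simulation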
--------------------------------------------------------------------------------
/gen_largest_num_frm_list.py:
--------------------------------------------------------------------------------
1 | # Write a function that given a list of non
2 | # negative integers, arranges them such that
3 | # they form the largest possible number. For
4 | # example, given [50, 2, 1, 9], the largest
5 | # formed number is 95021
6 |
7 | from itertools import permutations
8 |
9 | def generate_largest_number(arr):
10 | gen_nums = []
11 | for i in permutations(arr, len(arr)):
12 | gen_nums.append("".join(map(str, i)))
13 | return max(gen_nums)
14 |
15 | arr = [54, 546, 548, 60]
16 | print(generate_largest_number(arr))  # 6054854654
17 |
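18 | # A more scalable alternative (a sketch, not the original approach):
19 | # the brute force above is O(n!). Sorting with a pairwise comparator
20 | # that puts a before b whenever a+b > b+a as strings gives the same
21 | # answer in O(n log n).
22 | from functools import cmp_to_key
23 | 
24 | def generate_largest_number_v2(arr):
25 |     strs = [str(x) for x in arr]
26 |     # a negative result sorts a before b, i.e. when a + b is the larger join
27 |     strs.sort(key=cmp_to_key(lambda a, b: (a + b < b + a) - (a + b > b + a)))
28 |     return "".join(strs)
29 | 
30 | print(generate_largest_number_v2(arr))  # 6054854654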
--------------------------------------------------------------------------------
/general_tree_structure.py:
--------------------------------------------------------------------------------
1 | class Node:
2 |     def __init__(self, value=None, children=None):
3 |         self.value = value
4 |         self.children = children if children is not None else []  # avoid a shared mutable default
5 |
6 | def getValue(self):
7 | return self.value
8 |
9 | def setValue(self, new_value):
10 | self.value = new_value
11 |
12 | def getNumChildren(self):
13 | return len(self.children)
14 |
15 | def getChild(self, index):
16 | return self.children[index]
17 |
18 |
19 | root = Node(5, [Node(2), Node(6)])
20 | print(root.getValue()) # 5
21 | print(root.getNumChildren()) # 2
22 | print(root.getChild(1).getValue())  # 6
23 |
--------------------------------------------------------------------------------
/getMinPlatforms.py:
--------------------------------------------------------------------------------
1 | # Given the arrival and departure times of all trains
2 | # that reach a railway station, the task is to find the
3 | # minimum number of platforms required for the railway
4 | # station so that no train waits.
5 |
6 | # We are given two arrays that represent the arrival and
7 | # departure times of trains that stop.
8 |
9 | def getMinPlatforms(arr, dep):
10 | if len(arr) != len(dep):
11 | print("Wrong inputs given...")
12 | return
13 | else:
14 | sorted_arr = sorted(arr + dep)
15 |
16 | minPlatform = 0
17 | trainsAtPlatform = 0
18 |
19 | for i in sorted_arr:
20 | if i in arr:
21 | trainsAtPlatform += 1
22 | if i in dep:
23 | trainsAtPlatform -= 1
24 | minPlatform = max(minPlatform, trainsAtPlatform)
25 |
26 | return minPlatform
27 |
28 | arrivalArr = [900, 940, 950, 1100, 1500, 1800]
29 | departureArr = [910, 1200, 1120, 1130, 1900, 2000]
30 |
31 | result = getMinPlatforms(arrivalArr, departureArr)
32 |
33 | print("Minimum no. of platforms for given time table are: ", result)
34 |
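35 | # A faster alternative (a sketch): the membership tests above cost
36 | # O(n) each, making the loop O(n^2). Sorting arrivals and departures
37 | # separately and sweeping with two pointers is O(n log n).
38 | def getMinPlatforms_v2(arr, dep):
39 |     arrivals, departures = sorted(arr), sorted(dep)
40 |     i, j = 0, 0
41 |     platforms, max_platforms = 0, 0
42 |     while i < len(arrivals) and j < len(departures):
43 |         if arrivals[i] <= departures[j]:
44 |             platforms += 1      # a train arrives before the next departure
45 |             i += 1
46 |         else:
47 |             platforms -= 1      # a train departs, freeing one platform
48 |             j += 1
49 |         max_platforms = max(max_platforms, platforms)
50 |     return max_platforms
51 | 
52 | print(getMinPlatforms_v2(arrivalArr, departureArr))  # 3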
--------------------------------------------------------------------------------
/get_dup_chars.py:
--------------------------------------------------------------------------------
1 | # Print duplicate characters in a string
2 |
3 |
4 | def get_dup_chars(input_str):
5 | dedupe_str = ""
6 | dup_chars = []
7 |
8 | for char in input_str:
9 | if char not in dedupe_str:
10 | dedupe_str += char
11 | else:
12 | dup_chars.append(char)
13 |
14 | return dup_chars
15 |
16 |
17 | result = get_dup_chars("zmaxxazkgv")
18 | print(result) # ['x', 'a', 'z']
19 |
--------------------------------------------------------------------------------
/hasZeroSumSubArray.py:
--------------------------------------------------------------------------------
1 | # This method returns the sum of numbers
2 | # present till each index
3 |
4 | def getPrefixArray(arr):
5 | return [sum(arr[:i]) for i in range(1, len(arr)+1)]
6 |
7 | # This method will create a mapping of numbers
8 | # and the indices they are present at
9 |
10 | def getIndexMap(arr):
11 | indexMap = {}
12 |
13 | for i in range(len(arr)):
14 | if arr[i] not in indexMap.keys():
15 | indexMap[arr[i]] = [i,]
16 | else:
17 | indexMap[arr[i]].append(i)
18 |
19 | return indexMap
20 |
21 | # This method checks the prefix sums of the
22 | # array: if any prefix sum is zero, or any
23 | # sum repeats, there is a zero-sum subarray
24 | 
25 | def hasZeroSum(arr):
26 |     prefixArr = getPrefixArray(arr)
27 |     if 0 in prefixArr:  # a zero prefix sum is itself a zero-sum subarray
28 |         return True
29 |     sumAtIndexMap = getIndexMap(prefixArr)
30 |     for v in sumAtIndexMap.values():
31 |         if len(v) > 1:
32 |             return True
33 |     return False
34 |
35 |
36 | ipArray = [1, 4, -2, -2, 5, -4, 3]
37 | hasZeroSumTrue = hasZeroSum(ipArray)
38 | print(hasZeroSumTrue)
39 |
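40 | # A single-pass alternative (a sketch): keep the running sum in a set;
41 | # seeing 0, or any sum twice, means some subarray sums to zero.
42 | def hasZeroSum_v2(arr):
43 |     seen = {0}
44 |     running = 0
45 |     for num in arr:
46 |         running += num
47 |         if running in seen:
48 |             return True
49 |         seen.add(running)
50 |     return False
51 | 
52 | print(hasZeroSum_v2(ipArray))  # True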
--------------------------------------------------------------------------------
/has_only_digits.py:
--------------------------------------------------------------------------------
1 | # Given a string, check if it only contains digits.
2 |
3 |
4 | def is_digit(input_str):
5 | try:
6 |         # If it parses as an integer, return True. (Note: int() also
7 |         # accepts a sign and whitespace; str.isdigit() is stricter.)
8 |         int(input_str)
8 | return True
9 | except ValueError:
11 |         # If parsing fails (e.g. letters present), return False.
11 | return False
12 |
13 |
14 | result = is_digit("095357973590759530")
15 | print(result) # True
16 |
17 | result = is_digit("1234abc567")
18 | print(result) # False
19 |
--------------------------------------------------------------------------------
/haversine.py:
--------------------------------------------------------------------------------
1 | from math import radians, cos, sin, asin, sqrt
2 |
3 | def haversine(lon1, lat1, lon2, lat2):
4 | """
5 | Calculate the great circle distance between two points
6 | on the earth (specified in decimal degrees)
7 | """
8 | # convert decimal degrees to radians
9 | lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
10 |
11 | # haversine formula
12 | dlon = lon2 - lon1
13 | dlat = lat2 - lat1
14 | a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
15 | c = 2 * asin(sqrt(a))
16 | r = 6371 # Radius of earth in kilometers. Use 3956 for miles
17 | return c * r
18 |
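19 | # Usage sketch (coordinates approximate; arguments are lon1, lat1,
20 | # lon2, lat2 in decimal degrees): distance between London and Paris.
21 | # print(haversine(-0.13, 51.51, 2.35, 48.86))  # roughly 343 km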
--------------------------------------------------------------------------------
/heap_structure.py:
--------------------------------------------------------------------------------
1 | class Heap(object):
2 |
3 | HEAP_SIZE = 10
4 |
5 | def __init__(self):
6 | self.heap = [0] * Heap.HEAP_SIZE
7 | self.currentPosition = -1
8 |
9 | def insert(self, item):
10 |
11 | # if heap is full , we print a notification
12 | if self.isFull():
13 | print("Heap is full")
14 | return
15 | # else, increment the currentPosition and add item
16 | self.currentPosition += 1
17 | self.heap[self.currentPosition] = item
18 | self.fixUp(self.currentPosition)
19 |
20 | def fixUp(self, index):
21 | parentIndex = int((index - 1) / 2)
22 | while parentIndex >= 0 and self.heap[parentIndex] < self.heap[index]:
23 | # if True swap heap[index] and heap[parentIndex]
24 | temp = self.heap[index]
25 | self.heap[index] = self.heap[parentIndex]
26 | self.heap[parentIndex] = temp
27 | # update the index and parentIndex
28 | index = parentIndex
29 | parentIndex = int((index - 1) / 2)
30 |
31 |     def fixDown(self, index, upto):
32 |         if upto < 0:
33 |             upto = self.currentPosition
34 | 
35 |         while index <= upto:
36 |             leftChild = 2 * index + 1
37 |             rightChild = 2 * index + 2
38 | 
39 |             # stop when the node has no children within range
40 |             if leftChild > upto:
41 |                 break
42 | 
43 |             # pick the larger child to compare against
44 |             if rightChild <= upto and self.heap[rightChild] > self.heap[leftChild]:
45 |                 childToSwap = rightChild
46 |             else:
47 |                 childToSwap = leftChild
48 | 
49 |             if self.heap[index] < self.heap[childToSwap]:
50 |                 self.heap[index], self.heap[childToSwap] = self.heap[childToSwap], self.heap[index]
51 |             else:
52 |                 break
53 | 
54 |             index = childToSwap
55 | 
56 |     def heapSort(self):
57 |         for i in range(0, self.currentPosition + 1):
58 |             temp = self.heap[0]
59 |             print("%d" % temp)
60 |             self.heap[0] = self.heap[self.currentPosition - i]
61 |             self.heap[self.currentPosition - i] = temp
62 |             self.fixDown(0, self.currentPosition - i - 1)
63 | 
64 |     def getMax(self):
65 |         result = self.heap[0]
66 |         # move the last item to the root before shrinking the heap
67 |         self.heap[0] = self.heap[self.currentPosition]
68 |         self.currentPosition -= 1
69 |         self.fixDown(0, -1)
70 |         return result
71 | 
72 |     def isFull(self):
73 |         # currentPosition is an index, so the last valid slot is HEAP_SIZE - 1
74 |         return self.currentPosition == Heap.HEAP_SIZE - 1
75 | 
76 | 
77 | some_heap = Heap()
78 | some_heap.insert(12)
79 | some_heap.insert(-3)
80 | some_heap.insert(21)
81 | some_heap.insert(7)
82 | some_heap.insert(4)
83 | some_heap.heapSort()  # prints 21, 12, 7, 4, -3
86 |
--------------------------------------------------------------------------------
/hundred_without_int.py:
--------------------------------------------------------------------------------
1 | # Print numbers 1 to 100 without using any numbers or integers
2 |
3 | # APPROACH
4 | # Use Boolean values
5 |
6 | ONE = str(int(True))
7 | ZERO = str(int(False))
8 | HUNDRED = int(ONE + ZERO + ZERO)
9 |
10 | for i in range(int(ONE), HUNDRED+1):
11 | print(i, end=', ')
12 |
13 | # OUTPUT (Actual prints in the same line, line breaks given here for code clarity):
14 |
15 | # 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
16 | # 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
17 | # 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
18 | # 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
19 | # 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
20 | # 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
21 | # 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
22 | # 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
23 | # 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
24 | # 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
25 |
26 |
27 | # ALTERNATE APPROACH
28 | # ORD (or "ordinal") function in python gives the ASCII
29 | # (American Standard Code for information interchange) value
30 | # of characters ranging from 0-256 in 8 bits of memory which
31 | # is equal to a byte.
32 |
33 | ONE = str(ord('b') - ord('a'))
34 | ZERO = str(ord('a') - ord('a'))
35 | HUNDRED = int(ONE + ZERO + ZERO)
36 |
37 | for i in range(int(ONE), HUNDRED+1):
38 | print(i, end=', ')
39 |
--------------------------------------------------------------------------------
/interger_to_roman_num.py:
--------------------------------------------------------------------------------
1 | # Write a program to convert an integer to its Roman numeral representation
2 | def convertIntegerToRomanNum(input_num):
3 |
4 | number_list = [
5 | 1000, 900, 500, 400,
6 | 100, 90, 50, 40,
7 | 10, 9, 5, 4,
8 | 1
9 | ]
10 |
11 | symbol_list = [
12 | 'M', 'CM', 'D', 'CD',
13 | 'C', 'XC', 'L', 'XL',
14 | 'X', 'IX', 'V', 'IV',
15 | 'I'
16 | ]
17 |
18 | result_str = ''
19 | i = 0
20 |
21 | while input_num > 0:
22 | # if input_num//number_list[i] > 0 , then we will get a string appended
23 | # else it will continue the loop
24 | for _ in range(input_num//number_list[i]):
25 | result_str = result_str + symbol_list[i]
26 | input_num = input_num - number_list[i]
27 | i = i + 1
28 | return result_str
29 |
30 | roman = convertIntegerToRomanNum(289)
31 | print(roman)
32 |
33 | # ===============
34 | # Explanation:
35 | # ===============
36 | # input_num = 289
37 | # 289//1000 --> 0, i=1
38 | # 289//900  --> 0, i=2
39 | # 289//500  --> 0, i=3
40 | # 289//400  --> 0, i=4
41 | # 289//100  --> 2, append 'C' twice: result_str='CC', input_num=89, i=5
42 | # 89//90    --> 0, i=6
43 | # 89//50    --> 1, append 'L': result_str='CCL', input_num=39, i=7
44 | # 39//40    --> 0, i=8
45 | # 39//10    --> 3, append 'X' three times: result_str='CCLXXX', input_num=9, i=9
46 | # 9//9      --> 1, append 'IX': result_str='CCLXXXIX', input_num=0
47 | # EXIT
48 | 
--------------------------------------------------------------------------------
/intersection_arrays.py:
--------------------------------------------------------------------------------
1 | # Problem: Given two sorted array of sizes m and n
2 | # in which all elements are distinct. Find the
3 | # common elements between them
4 | # Constraints: in O(m+n) complexity.
5 |
6 | def inr(z):
7 | return z + 1
8 |
9 | def intersectionArrays(x, y, m, n):
10 | i, j = 0, 0
11 | intersect_arr = []
12 |
13 | while i < m and j < n:
14 | # print(i, j)
15 | if x[i] == y[j]:
16 | intersect_arr.append(x[i])
17 | i, j = inr(i), inr(j)
18 | elif x[i] < y[j]:
19 | i = inr(i)
20 | else:
21 | j = inr(j)
22 | print(intersect_arr)
23 | return
24 |
25 | list_a = [1, 2, 3, 4, 5]
26 | list_b = [2, 3, 5, 6]
27 | intersectionArrays(list_a, list_b, len(list_a), len(list_b))
28 |
--------------------------------------------------------------------------------
/isMatrixSymmetric.py:
--------------------------------------------------------------------------------
1 | def isMatrixSymmetric(mat, size):
2 |     for i in range(size):
3 |         for j in range(size):
4 |             # any mismatched pair means the matrix is not symmetric
5 |             if mat[i][j] != mat[j][i]:
6 |                 return False
7 | 
8 |     return True
9 | 
10 | ipMat = [[1,2,3],[2,4,5],[3,5,8]]
11 | result = isMatrixSymmetric(ipMat, len(ipMat))
12 | print(result)  # True
13 |
--------------------------------------------------------------------------------
/is_anagram.py:
--------------------------------------------------------------------------------
1 | # Check if two strings are anagrams of each other
2 |
3 |
4 | def anagramise(word):
5 | d = dict()
6 |
7 | for char in word:
8 | if char not in d.keys():
9 | d[char] = 1
10 | else:
11 | d[char] += 1
12 |
13 | return d
14 |
15 |
16 | def is_anagram(str1, str2):
17 | return anagramise(str1) == anagramise(str2)
18 |
19 |
20 | result = is_anagram("hello", "billion")
21 | print(result) # False
22 |
--------------------------------------------------------------------------------
/is_anagram_using_collections.py:
--------------------------------------------------------------------------------
1 | # Check if two strings are anagrams of each other
2 |
3 |
4 | from collections import Counter
5 |
6 | def is_anagram(str1, str2):
7 | return Counter(str1) == Counter(str2)
8 |
9 |
10 | result = is_anagram("hello", "billion")
11 | print(result) # False
12 |
13 | result = is_anagram("million", "million")
14 | print(result) # True
15 |
--------------------------------------------------------------------------------
/is_num_palindrome.py:
--------------------------------------------------------------------------------
1 | # Write a program to check if a
2 | # number is a palindrome or not
3 |
4 | def is_num_palindrome(num):
5 | temp = str(num)
6 | i = 0
7 | j = len(temp) - 1
8 |
9 |     while j > i:
10 |         # compare the characters moving inward from both ends
11 | if temp[i] == temp[j]:
12 | i = i + 1
13 | j = j - 1
14 | else:
15 | return False
16 | return True
17 |
18 | n = 3456543
19 | res = is_num_palindrome(n)
20 | print(res)
21 |
--------------------------------------------------------------------------------
/is_numeric.py:
--------------------------------------------------------------------------------
1 | # Given a string, return True if it
2 | # is a numeric data type, False otherwise
3 |
4 |
5 | def is_numeric(input_str):
6 |
7 | data_types = [
8 | int,
9 | float,
10 | complex,
11 | lambda T: int(T, 2), # binary
12 | lambda T: int(T, 8), # octal
13 | lambda T: int(T, 16), # hex
14 | ]
15 |
16 | for dtype in data_types:
17 | try:
18 | dtype(input_str)
19 | return True
20 | except ValueError:
21 | pass
22 | return False
23 |
24 |
25 | tests = [
26 | "0",
27 | "0.",
28 | "00",
29 | "123",
30 | "0123",
31 | "+123",
32 | "-123",
33 | "-123.",
34 | "-123e-4",
35 | "-.8E-04",
36 | "0.123",
37 | "(5)",
38 | "-123+4.5j",
39 | "0b0101",
40 | " +0B101 ",
41 | "0o123",
42 | "-0xABC",
43 | "0x1a1",
44 | "12.5%",
45 | "1/2",
46 | "½",
47 | "3¼",
48 | "π",
49 | "Ⅻ",
50 | "1,000,000",
51 | "1 000",
52 | "- 001.20e+02",
53 | "NaN",
54 | "inf",
55 | "-Infinity",
56 | ]
57 |
58 | for s in tests:
59 | print(s, "---", is_numeric(s))
60 |
61 | """
62 | OUTPUT:
63 |
64 | 0 --- True
65 | 0. --- True
66 | 00 --- True
67 | 123 --- True
68 | 0123 --- True
69 | +123 --- True
70 | -123 --- True
71 | -123. --- True
72 | -123e-4 --- True
73 | -.8E-04 --- True
74 | 0.123 --- True
75 | (5) --- True
76 | -123+4.5j --- True
77 | 0b0101 --- True
78 | +0B101 --- True
79 | 0o123 --- True
80 | -0xABC --- True
81 | 0x1a1 --- True
82 | 12.5% --- False
83 | 1/2 --- False
84 | ½ --- False
85 | 3¼ --- False
86 | π --- False
87 | Ⅻ --- False
88 | 1,000,000 --- False
89 | 1 000 --- False
90 | - 001.20e+02 --- False
91 | NaN --- True
92 | inf --- True
93 | -Infinity --- True
94 | """
95 |
--------------------------------------------------------------------------------
/josephus.py:
--------------------------------------------------------------------------------
1 | # Problem: N soldiers are standing in a circle and
2 | # first person has sword and he kills the 2nd person
3 | # and gives the sword to the third person and so on
4 | # till 99th person kills the 100th person gives the
5 | # sword back to the first person, this goes on till
6 | # only one person survives. Print the survivor.
7 |
8 |
9 | def josephus(people, step=2):
10 | if step <= 1:
11 | print("Enter step value, greater than 1")
12 | else:
13 | step -= 1 # translated to zero-based indexing
14 | kill = step # kill will hold the index of current person to die
15 | while len(people) > 1:
16 | print(people.pop(kill)) # pop method removes the element from the list
17 | kill = (kill + step) % len(people)
18 | print(people[0], "is safe")
19 |
20 |
21 | num = int(input("Enter the number of soldiers: "))
22 | soldiers = [i for i in range(1, num + 1)] # generates a list of 1..num
23 | josephus(soldiers)
24 |
--------------------------------------------------------------------------------
/josephus_improved.py:
--------------------------------------------------------------------------------
1 | # The effective time complexity of the
2 | # improved version is O(logN). For the
3 | # problem statement, refer `josephus.py`
4 |
5 |
6 | def josephus_v2(people, step=2):
7 | if step <= 1:
8 | print("Enter step value, greater than 1")
9 | else:
10 | # len() method has O(1) time
11 | N = len(people) # caching the size of array
12 | p = 1
13 | # the loop runs for O(floor(logN)) time
14 | while p * 2 < N:
15 | p = p * 2
16 |         # N is a power of 2 exactly when N - p >= p; the survivor is then 1
17 | if N - p >= p:
18 | print(1)
19 | else:
20 | print((2 * (N - p)) + 1)
21 |
22 |
23 | num = int(input("Enter the number of soldiers: "))
24 | soldiers = [i for i in range(1, num + 1)] # generates a list of 1..num
25 | josephus_v2(soldiers)
26 |
--------------------------------------------------------------------------------
/josephus_improved_v3.py:
--------------------------------------------------------------------------------
1 | # The effective time complexity of this improved version is O(1).
2 | # For the problem statement, refer `josephus.py`
3 |
4 |
5 | def josephus_v3(soldiers):
6 | # Convert to binary.
7 | binary = bin(soldiers)
8 | # Get the first digit and put it as last.
9 | shift = "0b" + binary[3::] + binary[2:3:]
10 | # Convert to decimal.
11 | return int(shift, 2)
12 |
13 |
14 | winning = josephus_v3(soldiers=41)
15 | print(winning) # Winning Soldier: 19
16 |
17 | winning = josephus_v3(soldiers=100)
18 | print(winning) # Winning Soldier: 73
19 |
20 | winning = josephus_v3(soldiers=1000)
21 | print(winning) # Winning Soldier: 977
22 |
23 | # Testing:
24 | test_josephus = {
25 | 1: 1,
26 | 2: 1,
27 | 3: 3,
28 | 4: 1,
29 | 5: 3,
30 | 6: 5,
31 | 7: 7,
32 | 8: 1,
33 | 9: 3,
34 | 10: 5,
35 | 11: 7,
36 | 12: 9,
37 | 13: 11,
38 | 16: 1,
39 | 41: 19,
40 | }
41 | for soldiers, expected_winner in test_josephus.items():
42 | assert josephus_v3(soldiers=soldiers) == expected_winner
43 |
--------------------------------------------------------------------------------
/karatsuba.py:
--------------------------------------------------------------------------------
1 | import random
2 | from math import ceil
3 | from math import log10
4 |
5 |
6 | def get_digits(n):
7 | if n > 0:
8 | digits = int(log10(n)) + 1
9 | elif n == 0:
10 | digits = 1
11 | else:
12 | digits = int(log10(-n)) + 2
13 | return digits
14 |
15 |
16 | def karatsuba(x, y):
17 | # the base case for recursion
18 | if x < 10 and y < 10:
19 | return x * y
20 |
21 | # n is the number of digits in the highest input number
22 | n = max(get_digits(x), get_digits(y))
23 |
24 | n_2 = int(ceil(n / 2.0))
25 | n = n if n % 2 == 0 else n + 1
26 |
27 | # split the input numbers
28 | a, b = divmod(x, 10 ** n_2)
29 | c, d = divmod(y, 10 ** n_2)
30 |
31 | # applying the recursive steps
32 | ac = karatsuba(a, c)
33 | bd = karatsuba(b, d)
34 | ad_bc = karatsuba((a + b), (c + d)) - ac - bd
35 |
36 | # performs the multiplication
37 | z2 = (10 ** n) * ac
38 | z1 = (10 ** n_2) * ad_bc
39 | z0 = bd
40 | return z2 + z1 + z0
41 |
42 |
43 | def test():
44 | for i in range(1000):
45 | x = random.randint(1, 10 ** 5)
46 | y = random.randint(1, 10 ** 5)
47 | expected = x * y
48 | result = karatsuba(x, y)
49 | if result != expected:
50 | return print("failed")
51 | return print("ok")
52 |
53 |
54 | if __name__ == "__main__":
55 | test()
56 |
--------------------------------------------------------------------------------
/level_order_tree.py:
--------------------------------------------------------------------------------
1 | # This is a level order traversal of
2 | # a binary tree. It is also known as
3 | # breadth-first traversal.
4 |
5 |
6 | class Node:
7 | def __init__(self, data):
8 | self.data = data
9 | self.left = None
10 | self.right = None
11 |
12 |
13 | def printLevelOrder(root):
14 | height = getHeight(root)
15 | if root is None:
16 | return
17 | else:
18 | # iteratively print all the levels
19 | for i in range(1, height + 1):
20 | printGivenLevel(root, i)
21 |
22 |
23 | def printGivenLevel(root, level):
24 | if root is None:
25 | return
26 | if level == 1:
27 | print("%d" % root.data)
28 | elif level > 1:
29 | # recursively print the given level
30 | printGivenLevel(root.left, level - 1)
31 | printGivenLevel(root.right, level - 1)
32 |
33 |
34 | def getHeight(root):
35 | if root is None:
36 | return 0
37 | else:
38 | lheight = getHeight(root.left)
39 | rheight = getHeight(root.right)
40 |
41 | if lheight > rheight:
42 | return lheight + 1
43 | else:
44 | return rheight + 1
45 |
46 |
47 | root = Node(1)
48 | root.left = Node(2)
49 | root.right = Node(3)
50 | root.left.left = Node(4)
51 | root.left.right = Node(5)
52 |
53 | print("Level order traversal of binary tree is -")
54 | printLevelOrder(root)
55 |
--------------------------------------------------------------------------------
/linked_list_data_structure.py:
--------------------------------------------------------------------------------
1 | class Node:
2 | def __init__(self, data=None):
3 | self.data = data
4 | self.next_node = None
5 |
6 |
7 | class LinkedList:
8 | def __init__(self, head=None):
9 | self.head = head
10 |
11 | def isEmpty(self):
12 |         return self.head is None
13 |
14 | def insert(self, data):
15 | # create a temp node
16 | temp = Node(data=data)
17 | # point new node to head
18 | temp.next_node = self.head
19 | # set the head as new node
20 | self.head = temp
21 |
22 |     def insert_after(self, prev, data):
23 |         # a valid existing node is required to insert after
24 |         if prev is None:
25 |             raise ValueError("Given node is not found...")
26 |
27 | # create a temp node
28 | temp = Node(data=data)
29 | # set next node of temp to the next node of previous
30 | temp.next_node = prev.next_node
31 | # set next node of previous to point temp
32 | prev.next_node = temp
33 |
34 | def size(self):
35 | # start with the head
36 | current = self.head
37 | count = 0
38 |
39 | # loop unless current is not None
40 | while current:
41 | count += 1
42 | current = current.next_node
43 | return count
44 |
45 | def search(self, data):
46 | # start with the head
47 | current = self.head
48 | found = False
49 |
50 | # loop unless current is not None
51 | while current and not found:
52 | # if found, change flag and return data
53 | if current.data == data:
54 | found = True
55 | else:
56 | # change current to next node
57 | current = current.next_node
58 | if current is None:
59 | # raise Exception if not found
60 | raise ValueError("Data is not in the list")
61 | return current
62 |
63 | def delete(self, data):
64 | # start with the head
65 | current = self.head
66 | previous = None
67 | found = False
68 |
69 | # loop unless current is not None
70 | while current and not found:
71 | # if found, change flag
72 |             if current.data == data:
73 | found = True
74 | else:
75 | previous = current
76 | current = current.next_node
77 |
78 | if current is None:
79 | # raise Exception if not found
80 | raise ValueError("Data is not in the list")
81 | if previous is None:
82 | self.head = current.next_node
83 | else:
84 | previous.next_node = current.next_node
85 |
--------------------------------------------------------------------------------
/loop_in_linkedlist.py:
--------------------------------------------------------------------------------
1 | from linked_list_data_structure import LinkedList
2 | 
3 | 
4 | # Floyd's cycle detection: advance a slow pointer one node
5 | # at a time and a fast pointer two nodes at a time; if the
6 | # list has a loop, the two pointers must eventually meet.
7 | def hasLoop(l_list):
8 |     slow = l_list.head
9 |     fast = l_list.head
10 | 
11 |     while fast is not None and fast.next_node is not None:
12 |         slow = slow.next_node
13 |         fast = fast.next_node.next_node
14 |         if slow is fast:
15 |             return True
16 | 
17 |     return False
18 | 
19 | 
20 | linked_list = LinkedList()
21 | linked_list.insert(3)
22 | linked_list.insert(2)
23 | linked_list.insert(1)
24 | print(hasLoop(linked_list))  # False -- this list has no cycle
25 |
--------------------------------------------------------------------------------
/lowest_common_ancestor.py:
--------------------------------------------------------------------------------
1 | def findLowestCommonAncestor(root, value1, value2):
2 | while root is not None:
3 | value = root.value
4 | if value > value1 and value > value2:
5 | root = root.left
6 | elif value < value1 and value < value2:
7 | root = root.right
8 | else:
9 | return root
10 |
--------------------------------------------------------------------------------
/majority_element.py:
--------------------------------------------------------------------------------
1 | # Problem: A majority element in an array A[] of
2 | # size n is an element that appears more than n/2
3 | # times. Find the majority element in the given array.
4 |
5 | # Returns the elements whose frequency is more than n/2
6 | def findMajorityElement(elements, N, found=False):
7 | keys = [int(i) for i in elements.keys()]
8 | for i in keys:
9 | if elements[i] > N // 2:
10 | found = True
11 | print(i)
12 | if not found:
13 | print("Majority element not found")
14 |
15 |
16 | # Creates a hash of frequency of numbers
17 | def mapFrequency(arr):
18 | FREQUENCY = {}
19 | for i in arr:
20 | if i in FREQUENCY.keys():
21 | FREQUENCY[i] += 1
22 | else:
23 |             FREQUENCY[i] = 1  # first occurrence counts as one
24 | return findMajorityElement(FREQUENCY, len(arr))
25 |
26 |
27 | arr = [1, 2, 4, 4, 4, 4, 4]
28 | mapFrequency(arr)
29 |
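30 | # An O(1)-space alternative (a sketch): Boyer-Moore voting keeps one
31 | # candidate and a counter; a genuine majority element always survives
32 | # the cancellation, and a second pass confirms it.
33 | def findMajorityElement_v2(arr):
34 |     candidate, count = None, 0
35 |     for i in arr:
36 |         if count == 0:
37 |             candidate = i
38 |         count += 1 if i == candidate else -1
39 |     # verify, since the surviving candidate only *may* be a majority
40 |     if arr.count(candidate) > len(arr) // 2:
41 |         return candidate
42 |     return None
43 | 
44 | print(findMajorityElement_v2(arr))  # 4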
--------------------------------------------------------------------------------
/max_in_array.py:
--------------------------------------------------------------------------------
1 | def maxIndex(arr):
2 | max_index = 0
3 | for i in range(1, len(arr)):
4 | if arr[max_index] < arr[i]:
5 | max_index = i
6 | return max_index
7 |
8 | arr = [4,5,6,7,8,1,2,11,12,13,3,9,10]
9 | res = maxIndex(arr)
10 | print("maximum element is", arr[res],"at index:", res)
11 |
--------------------------------------------------------------------------------
/maximum_subarray_sum.py:
--------------------------------------------------------------------------------
1 | # Problem: Given a list of positive and negative numbers, find the maximum subarray sum.
2 | # Constraint: Solve it in O(n)
3 |
4 | # Solution: Use two variables to hold sums
5 | # a. overall sum -- initialize to first element
6 | # b. partial sum -- initialize to first element
7 | # Traverse over the whole array
8 | # At each element, the partial sum becomes the larger of the element itself and partial sum + element
9 | # If the partial sum exceeds the overall sum, the overall sum is updated
10 | # At the end, the overall sum is the largest sum of any contiguous subarray (Kadane's algorithm)
11 |
12 | def maxSubArraySum(arr):
13 | max_so_far = arr[0]
14 | current_max = arr[0]
15 |
16 | for i in range(1,len(arr)):
17 | current_max = max(arr[i], current_max + arr[i])
18 | max_so_far = max(max_so_far, current_max)
19 |
20 | return max_so_far
21 |
22 | sampleArr = [-2, -3, 4, -1, -2, 1, 5, -3]
23 |
24 | solution = maxSubArraySum(sampleArr)
25 | print(solution)
26 |
--------------------------------------------------------------------------------
/merge_sort.py:
--------------------------------------------------------------------------------
1 | def merge_sort(array):
2 | # derive the mid-point
3 | if len(array) > 1:
4 | mid = len(array) // 2
5 |
6 | # create the temp sub-arrays
7 | LEFT = array[:mid]
8 | RIGHT = array[mid:]
9 |
10 | # sort the first and second halves
11 | merge_sort(LEFT)
12 | merge_sort(RIGHT)
13 |
14 |         # begin adding elements in sorted order
15 |         i, j, k = 0, 0, 0
16 |
17 | while i < len(LEFT) and j < len(RIGHT):
18 | if LEFT[i] < RIGHT[j]:
19 | array[k] = LEFT[i]
20 | i += 1
21 | else:
22 | array[k] = RIGHT[j]
23 | j += 1
24 | k += 1
25 |
26 | # copy the remaining data
27 | while i < len(LEFT):
28 | array[k] = LEFT[i]
29 | i += 1
30 | k += 1
31 |
32 | while j < len(RIGHT):
33 | array[k] = RIGHT[j]
34 | j += 1
35 | k += 1
36 |
37 |
38 | arr = [6, 5, 3, 1, 8, 7, 2, 4]
39 | merge_sort(arr)
40 | print(arr)
41 |
--------------------------------------------------------------------------------
/min_max_array_oneLoop.py:
--------------------------------------------------------------------------------
1 | def maxMinIndex(arr):
2 |
3 | max_index = 0
4 | min_index = 0
5 |
6 |     # single pass over the array, updating both indices
7 |     for i in range(1, len(arr)):
8 | if arr[max_index] < arr[i]:
9 | max_index = i
10 | if arr[min_index] > arr[i]:
11 | min_index = i
12 |
13 | return max_index, min_index
14 |
15 | arr = [4,5,6,7,8,1,2,11,12,13,3,9,10]
16 | res = maxMinIndex(arr)
17 | print("maximum element is", arr[res[0]],"at index:", res[0])
18 | print("minimum element is", arr[res[1]],"at index:", res[1])
19 |
--------------------------------------------------------------------------------
/move_zeros_to_end.py:
--------------------------------------------------------------------------------
1 | # Given an array of integers we need to move
2 | # all the zeroes to the end and maintain the
3 | # order of rest of the elements. Needless to
4 | # say it should be an in-place solution
5 |
6 | def move_zero_to_end(arr):
7 | count = 0
8 |
9 | for i in arr:
10 | if i != 0:
11 | arr[count] = i
12 | count = count + 1
13 |
14 | for i in range(count, len(arr)):
15 | arr[i] = 0
16 |
17 | return arr
18 |
19 |
20 |
21 |
22 | array = [1, 9, 8, 4, 0, 0, 2, 7, 0, 6, 0, 9]
23 | res = move_zero_to_end(array)
24 | print(res)
25 |
--------------------------------------------------------------------------------
/no_sibling_tree.py:
--------------------------------------------------------------------------------
1 | # Problem: Print the nodes of a binary tree
2 | # which do not have a sibling
3 |
4 |
5 | class Node:
6 | def __init__(self, data):
7 | self.data = data
8 | self.left = None
9 | self.right = None
10 |
11 |
12 | def printSingleNode(root, hasSibling):
13 | # hasSibling will check if root has both children
14 | if root is None:
15 | return
16 | else:
17 | # if root has one child, print that child data
18 | if not hasSibling:
19 | print("%d" % root.data)
20 |
21 | printSingleNode(root.left, root.right is not None)
22 | printSingleNode(root.right, root.left is not None)
23 |
24 |
25 | root = Node(1)
26 | root.left = Node(2)
27 | root.right = Node(3)
28 | root.left.left = Node(4)
29 | root.right.left = Node(5)
30 | root.right.left.left = Node(6)
31 |
32 | print("Level order traversal of binary tree is -")
33 | printSingleNode(root, True)
34 |
--------------------------------------------------------------------------------
/oddAscEvenDesc.py:
--------------------------------------------------------------------------------
1 | # Let all odd numbers come before even numbers,
2 | # and sort the odd numbers in ascending order and
3 | # even numbers in descending order.
4 | # For example, the string '1982376455' becomes '1355798642'
5 |
6 | def oddAscEvenDesc(inputStr):
7 | oddSubstr = ''
8 | evenSubstr = ''
9 |
10 | for char in inputStr:
11 | if int(char) % 2 == 0:
12 | evenSubstr += char
13 | else:
14 | oddSubstr += char
15 |
16 | temp = sorted(oddSubstr) + sorted(evenSubstr, reverse=True)
17 |
18 | return "".join(temp)
19 |
20 | x = "978231456"
21 | y = oddAscEvenDesc(x)
22 | print(y)
23 |
--------------------------------------------------------------------------------
/pascal_triangle.py:
--------------------------------------------------------------------------------
1 | # Recursive method to create the series
2 | def computePascal(col, row):
3 | # There are three things to compute
4 | # 1. Left edge: col is 0
5 | # 2. Right edge: col is same as row
6 | if col == row or col == 0:
7 | return 1
8 | # 3. any other cell: col-1 + col of the previous row
9 | else:
10 | return computePascal(col - 1, row - 1) + computePascal(col, row - 1)
11 |
12 |
13 | # Method to create the triangle for `N` row
14 | def printTriangle(num):
15 | for r in range(num):
16 | # upon observation, we can deduce the relation
17 | # num_cols = num_rows + 1
18 | for c in range(r + 1):
19 | print(str(computePascal(c, r)), end=" ")
20 | print("\n")
21 |
22 |
23 | printTriangle(10)
24 | """
25 | Output:
26 | 1
27 |
28 | 1 1
29 |
30 | 1 2 1
31 |
32 | 1 3 3 1
33 |
34 | 1 4 6 4 1
35 |
36 | 1 5 10 10 5 1
37 |
38 | 1 6 15 20 15 6 1
39 |
40 | 1 7 21 35 35 21 7 1
41 |
42 | 1 8 28 56 70 56 28 8 1
43 |
44 | 1 9 36 84 126 126 84 36 9 1
45 | """
46 |
--------------------------------------------------------------------------------
/pascals_triangle_improved.py:
--------------------------------------------------------------------------------
1 | # using factorials, the per-entry time complexity
2 | # drops from O(2^N) to O(N)
3 |
4 |
5 | def factorial(n):
6 | if n < 2:
7 | return 1
8 | else:
9 | return n * factorial(n - 1)
10 |
11 |
12 | def computeCoefficient(col, row):
13 | return factorial(row) // (factorial(col) * factorial(row - col))
14 |
15 |
16 | # Method to compute each entry of the series
17 | def computePascal(col, row):
18 | if col == row or col == 0:
19 | return 1
20 | else:
21 | return computeCoefficient(col, row)
22 |
23 |
24 | # Method to create the triangle for `N` row
25 | def printTriangle(num):
26 | for r in range(num):
27 | for c in range(r + 1):
28 | print(str(computePascal(c, r)), end=" ")
29 | print("\n")
30 |
31 |
32 | printTriangle(10)
33 | """
34 | Output:
35 | 1
36 |
37 | 1 1
38 |
39 | 1 2 1
40 |
41 | 1 3 3 1
42 |
43 | 1 4 6 4 1
44 |
45 | 1 5 10 10 5 1
46 |
47 | 1 6 15 20 15 6 1
48 |
49 | 1 7 21 35 35 21 7 1
50 |
51 | 1 8 28 56 70 56 28 8 1
52 |
53 | 1 9 36 84 126 126 84 36 9 1
54 | """
55 |
--------------------------------------------------------------------------------
/permutations.py:
--------------------------------------------------------------------------------
1 | def permutations(word):
2 |     if len(word) == 1:
3 |         return [word]
4 |     else:
5 |         result = []
6 |         # recursively permute the tail of the word, then insert
7 |         # the head character at every position of each permutation
8 |         for p in permutations(word[1:]):
9 |             for i in range(len(word)):
10 |                 current_p = p[:i] + word[0:1] + p[i:]
11 |                 result.append(current_p)
12 | 
13 |         return result
14 | 
15 | 
16 | given_input = "bc"
17 | print(permutations(given_input))  # ['bc', 'cb']
19 |
--------------------------------------------------------------------------------
/permute_strings.py:
--------------------------------------------------------------------------------
1 | def permute(s, result):
2 | if len(s) == 0:
3 | print(result, end=" ")
4 | return
5 |
6 | for i in range(len(s)):
7 | char = s[i]
8 | left_str = s[0: i]
9 | right_str = s[i+1: ]
10 |
11 | other_str = left_str + right_str
12 | permute(other_str, result + char)
13 |
14 | permute("naruto", "")
15 |
16 |
--------------------------------------------------------------------------------
/preorder_iterative_bst.py:
--------------------------------------------------------------------------------
1 | def preOrderTraversal(root):
2 |     if root is None:
3 |         return
4 |     # use the list as a stack: append() pushes, pop() removes the top
5 |     stack = [root]
6 |     while len(stack) > 0:
7 |         current = stack.pop()
8 |         print(current.value)
9 | 
10 |         # push the right child first so the left child is visited first
11 |         right = current.right
12 |         if right is not None:
13 |             stack.append(right)
14 | 
15 |         left = current.left
16 |         if left is not None:
17 |             stack.append(left)
18 | 
--------------------------------------------------------------------------------
/priority_queue_simple.py:
--------------------------------------------------------------------------------
1 | # Priority Queue is an extension of the queue with following properties.
2 | # 1) An element with high priority is dequeued before an element with low priority.
3 | # 2) If two elements have the same priority, they are served according to their order in the queue
4 | # The delete operation has time complexity of O(n)
5 | import sys  # needed for sys.exit() in delete()
6 | class PriorityQueue(object):
7 | def __init__(self):
8 | self.queue = []
9 |
10 | def __str__(self):
11 | return ' '.join([str(i) for i in self.queue])
12 |
13 | # for checking if the queue is empty
14 | def isEmpty(self):
15 | return len(self.queue) == 0
16 |
17 | # for inserting an element in the queue
18 | def insert(self, data):
19 | self.queue.append(data)
20 |
21 | # for popping an element based on Priority
22 | def delete(self):
23 | try:
24 |             max_index = 0
25 | 
26 |             for i in range(len(self.queue)):
27 |                 if self.queue[max_index] < self.queue[i]:
28 |                     max_index = i
29 | 
30 |             item = self.queue[max_index]
31 |             del self.queue[max_index]
32 |
33 | return item
34 |
35 | except IndexError:
36 | print()
37 | sys.exit()
38 |
39 | myQueue = PriorityQueue()
40 | myQueue.insert(12)
41 | myQueue.insert(1)
42 | myQueue.insert(14)
43 | myQueue.insert(7)
44 |
45 | print(myQueue)
46 |
47 | while not myQueue.isEmpty():
48 | print(myQueue.delete())
49 |
50 |
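51 | # A library alternative (a sketch): heapq gives O(log n) insert and
52 | # delete-max instead of O(n). heapq is a min-heap, so values are
53 | # negated to pop the largest first.
54 | import heapq
55 | 
56 | max_heap = []
57 | for value in (12, 1, 14, 7):
58 |     heapq.heappush(max_heap, -value)
59 | 
60 | while max_heap:
61 |     print(-heapq.heappop(max_heap))  # 14, 12, 7, 1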
--------------------------------------------------------------------------------
/processStringToDict.py:
--------------------------------------------------------------------------------
1 | # Process the string "k:1 |k1:2|k2:3|k3:4" into a dictionary {k:1,k1:2,...}
2 |
3 | StringToProcess = "k:1 |k1:2|k2:3|k3:4"
4 |
5 | d2 = dict()
6 | keyvalue_list = StringToProcess.split('|') # ['k:1' , 'k1:2' , 'k2:3' ,'k3:4']
7 |
8 | for keyval in keyvalue_list:
9 |     k, v = keyval.split(':')        # ('k', '1 '), ('k1', '2'), ...
10 |     d2[k.strip()] = int(v.strip())  # strip stray spaces, store ints
11 | 
12 | 
13 | print(d2)  # {'k': 1, 'k1': 2, 'k2': 3, 'k3': 4}
14 |
--------------------------------------------------------------------------------
/product_puzzle.py:
--------------------------------------------------------------------------------
1 | # Problem: Given an array arr[] of n integers,
2 | # construct a Product Array prod[] (of same size)
3 | # such that prod[i] is equal to the product of all
4 | # the elements of arr[] except arr[i].
5 |
6 | # Constraints: Solve it without division operator and in O(n)
7 |
8 |
9 | def computeProductArray(array, size):
10 | # initialize three arrays of the same size as given array
11 | # Left: will hold the product of all the elements to the left
12 | # Right: will hold the product of all the elements to the right
13 | # Product: contains the product value for current element
14 |
15 | prod = [1] * size
16 | left = [1] * size
17 | right = [1] * size
18 |
19 | for i in range(1, size):
20 | left[i] = array[i - 1] * left[i - 1]
21 | # decreasing loop in Python {start, end, step(-ve)}
22 | # equivalent to (j=size-2; j>=0; j--)
23 | for j in range(size - 2, -1, -1):
24 | right[j] = array[j + 1] * right[j + 1]
25 | for k in range(0, size):
26 | prod[k] = left[k] * right[k]
27 | print(prod)
28 |
29 |
30 | arr = [10, 3, 5, 6, 2]
31 | computeProductArray(arr, len(arr))
32 | # Result: [180, 600, 360, 300, 900]
33 |
--------------------------------------------------------------------------------
/queue_data_structure.py:
--------------------------------------------------------------------------------
1 | class Queue:
2 | def __init__(self):
3 | self.queue = []
4 |
5 | def enqueue(self, value):
6 | self.queue.insert(0, value)
7 |
8 |     def dequeue(self):
9 |         return self.queue.pop()
10 |
11 | def isEmpty(self):
12 | return self.size() == 0
13 |
14 | def size(self):
15 | return len(self.queue)
16 |
--------------------------------------------------------------------------------
/quick_sort.py:
--------------------------------------------------------------------------------
1 | def quick_sort(arr):
2 | quick_sort_helper(arr, 0, len(arr) - 1)
3 |
4 |
5 | def quick_sort_helper(arr, first, last):
6 | if first < last:
7 | pi = partition(arr, first, last)
8 |
9 | quick_sort_helper(arr, first, pi - 1)
10 | quick_sort_helper(arr, pi + 1, last)
11 |
12 |
13 | def partition(arr, first, last):
14 | pivot = arr[first]
15 |
16 | left = first + 1
17 | right = last
18 |
19 | done = False
20 | while not done:
21 | while left <= right and arr[left] <= pivot:
22 | left += 1
23 |
24 | while arr[right] >= pivot and right >= left:
25 | right -= 1
26 |
27 | if right < left:
28 | done = True
29 | else:
30 | arr[left], arr[right] = arr[right], arr[left]
31 |
32 | arr[first], arr[right] = arr[right], arr[first]
33 |
34 | return right
35 |
36 |
37 | alist = [54, 26, 93, 17, 77, 31, 44, 55, 20]
38 | quick_sort(alist)
39 | print(alist) # [17, 20, 26, 31, 44, 54, 55, 77, 93]
40 |
--------------------------------------------------------------------------------
/range_fn_float.py:
--------------------------------------------------------------------------------
1 | # Make a range function that works for `float` inputs
2 |
3 | def float_for(start, stop, increment, stop_inclusive=True):
4 | if stop_inclusive:
5 | stop += increment
6 |
7 | while start < stop:
8 | # The yield statement returns a `generator` object to
9 | # the one who calls the function which contains yield,
10 | # instead of simply returning a value.
11 | yield start
12 | start += increment
13 |
14 |
15 | for i in float_for(0.5, 0.95, 0.05):
16 | print(i)
17 |
18 | """
19 | Output:
20 |
21 | 0.5
22 | 0.55
23 | 0.6000000000000001
24 | 0.6500000000000001
25 | 0.7000000000000002
26 | 0.7500000000000002
27 | 0.8000000000000003
28 | 0.8500000000000003
29 | 0.9000000000000004
30 | 0.9500000000000004
31 | """
32 |
--------------------------------------------------------------------------------
/remove_chars.py:
--------------------------------------------------------------------------------
1 | # Write an efficient function that deletes characters from an ASCII
2 | # string where any character existing in remove must be deleted from
3 | # str. For example, given a str of "Battle of the Vowels: Hawaii vs.
4 | # Grozny" and a remove of "aeiou", the function should transform str
5 | # to “Bttl f th Vwls: Hw vs. Grzny”.
6 |
7 |
8 | def removeChars(main_string, remove_string):
9 | result = ""
10 | for char in main_string:
11 | if char not in remove_string:
12 | result += char
13 | return result
14 |
15 |
16 | given_input = "Battle of the Vowels: Hawaii vs. Grozny"
17 | vowels = "aeiou"
18 | print(removeChars(given_input, vowels))
19 |
--------------------------------------------------------------------------------
/remove_dup_chars.py:
--------------------------------------------------------------------------------
1 | # Remove duplicate characters from string
2 |
3 |
4 | def remove_dup_chars(input_str):
5 | dedupe_str = ""
6 |
7 | for char in input_str:
8 | if char not in dedupe_str:
9 | dedupe_str += char
10 |
11 | return dedupe_str
12 |
13 |
14 | result = remove_dup_chars("zmaxxazkgv")
15 | print(result)  # zmaxkgv
16 |
--------------------------------------------------------------------------------
/remove_duplicates.py:
--------------------------------------------------------------------------------
1 | def remove_duplicates(arr):
2 | return list(dict.fromkeys(arr))
3 |
4 |
5 | result = remove_duplicates([0, 0, 0, 1, 1, 2, 2, 3, 4, 5])
6 | print(result) # [0, 1, 2, 3, 4, 5]
7 |
--------------------------------------------------------------------------------
/remove_duplicates_v2.py:
--------------------------------------------------------------------------------
1 | def remove_duplicates_v2(arr):
2 | dedupe_arr = []
3 |
4 | for i in arr:
5 | if i not in dedupe_arr:
6 | dedupe_arr.append(i)
7 |
8 | return dedupe_arr
9 |
10 |
11 | result = remove_duplicates_v2([0, 0, 0, 1, 1, 2, 2, 3, 4, 5])
12 | print(result)
13 |
--------------------------------------------------------------------------------
/reverse_in_place.py:
--------------------------------------------------------------------------------
1 | def reverse_in_place(arr):
2 | i = 0
3 | j = len(arr) - 1
4 |
5 | while i != j and i < j:
6 | arr[i], arr[j] = arr[j], arr[i]
7 | i += 1
8 | j -= 1
9 |
10 | return arr
11 |
12 |
13 | result = reverse_in_place([4, 12, 14, 16, 18])
14 | print(result)
15 |
--------------------------------------------------------------------------------
/reverse_str_recursive.py:
--------------------------------------------------------------------------------
1 | # Given a string, get it reversed using recursion
2 |
3 |
4 | def recursive_reverse(input_str):
5 | if len(input_str) == 0:
6 | return ""
7 | else:
8 | return recursive_reverse(input_str[1:]) + input_str[0]
9 |
10 |
11 | result = recursive_reverse("aabbcc")
12 | print(result) # ccbbaa
13 |
--------------------------------------------------------------------------------
/reverse_words.py:
--------------------------------------------------------------------------------
1 | def reverseWords(sentence):
2 | stack = []
3 | words = sentence.split()
4 | for word in words:
5 | stack.insert(0, word)
6 | return " ".join(word for word in stack)
7 |
8 |
9 | given_input = "Do or do not, there is no try."
10 | print(reverseWords(given_input))
11 |
--------------------------------------------------------------------------------
/rotateMatrix180Deg.py:
--------------------------------------------------------------------------------
1 | # A transpose followed by a vertical flip rotates a square matrix
2 | # by 90 degrees (counter-clockwise); doing that twice gives 180.
3 | def transposeMatrix(ipMat, size):
4 |     opMat = [[0 for i in range(size)] for j in range(size)]
5 | 
6 |     for i in range(size):
7 |         for j in range(size):
8 |             opMat[j][i] = ipMat[i][j]
9 | 
10 |     return opMat
11 | 
12 | def reverseMatrix(ipMat, size):
13 |     opMat = [[0 for i in range(size)] for j in range(size)]
14 |     for i in range(size):
15 |         for j in range(size):
16 |             opMat[abs(i-(size-1))][j] = ipMat[i][j]
17 | 
18 |     return opMat
19 | 
20 | def rotateMatrixby180(ipMat, size):
21 |     mat_1 = transposeMatrix(ipMat, size)
22 |     mat_2 = reverseMatrix(mat_1, len(mat_1))
23 |     mat_3 = transposeMatrix(mat_2, len(mat_2))
24 |     mat_4 = reverseMatrix(mat_3, len(mat_3))
25 | 
26 |     return mat_4
27 | 
28 | def printMatrix(ipMat, size):
29 |     for i in range(size):
30 |         for j in range(size):
31 |             print(ipMat[i][j], end=" ")
32 |         print('\n')
33 | 
34 | matA = [[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16]]
35 | print("Original-Matrix" + '\n')
36 | printMatrix(matA, len(matA))
37 | 
38 | print("Transposed-Matrix" + '\n')
39 | transposedMat = transposeMatrix(matA, len(matA))
40 | printMatrix(transposedMat, len(transposedMat))
41 | 
42 | matB = [[1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15], [4, 8, 12, 16]]
43 | reverseMat = reverseMatrix(matB, len(matB))
44 | print("Reverse-Matrix" + '\n')
45 | printMatrix(reverseMat, len(reverseMat))
46 | 
47 | print("Rotated-180-Matrix" + '\n')
48 | rotatedMat180 = rotateMatrixby180(matA, len(matA))
49 | printMatrix(rotatedMat180, len(rotatedMat180))
--------------------------------------------------------------------------------
/rotate_matrix.py:
--------------------------------------------------------------------------------
1 | # Problem: Rotate a square matrix by
2 | # 90 degree with O(1) extra space
3 | def rotate_by_90(m):
4 | # unpacking arguments with zip(*) in reverse with [ : :-1]
5 | tuples = zip(*m[::-1])
6 | # flattening tuples to list with [list(i)]
7 | return [list(i) for i in tuples]
8 |
9 |
10 | def makeMatrix(array, size):
11 | # validating size of matrix for given array
12 | if size ** 2 != len(array):
13 | return -1
14 | # make sub array of length size using array slicing
15 | else:
16 | matrix = [array[i : i + size] for i in range(0, len(array), size)]
17 | return rotate_by_90(matrix)
18 |
19 |
20 | arr = [1, 2, 3, 4]
21 | dimension = 2
22 | result = makeMatrix(arr, dimension)
23 | print(result)
24 | # Original Matrix: [[1, 2], [3, 4]]
25 | # Result: [[3, 1], [4, 2]]
26 | 
--------------------------------------------------------------------------------
/running_median_integers.py:
--------------------------------------------------------------------------------
1 | # Problem: Find Median from Data Stream
2 |
3 | def findMedian(stream):
4 | # print(stream)
5 | streamSize = len(stream)
6 |
7 | if streamSize == 1:
8 | return stream[0]
9 | else:
10 | stream = sorted(stream)
11 | midPt = streamSize // 2
12 |
13 | if streamSize % 2 == 1:
14 | return stream[midPt]
15 | else:
16 |             return (stream[midPt] + stream[midPt-1]) / 2  # the median may be fractional
17 |
18 | def findRunningMedian(inputArray):
19 | medianArray = []
20 |
21 | for i in range(0,len(inputArray)):
22 | # print(inputArray)
23 | currentMedian = findMedian(inputArray[0:i+1])
24 | medianArray.append(currentMedian)
25 |
26 | return medianArray
27 |
28 |
29 | solution = findRunningMedian([1, 2, 3, 4, 5])
30 | print(solution)
31 |
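32 | # A heap-based alternative (a sketch): re-sorting every prefix makes
33 | # the version above roughly O(n^2 log n). Keeping a max-heap of the
34 | # lower half and a min-heap of the upper half costs O(log n) per element.
35 | import heapq
36 | 
37 | def findRunningMedian_v2(inputArray):
38 |     lower, upper = [], []   # max-heap (stored negated) and min-heap
39 |     medians = []
40 |     for x in inputArray:
41 |         # push to the lower half, then move its maximum to the upper
42 |         # half so every element of lower <= every element of upper
43 |         heapq.heappush(lower, -x)
44 |         heapq.heappush(upper, -heapq.heappop(lower))
45 |         # rebalance: lower holds the extra element when the count is odd
46 |         if len(upper) > len(lower):
47 |             heapq.heappush(lower, -heapq.heappop(upper))
48 |         if len(lower) > len(upper):
49 |             medians.append(-lower[0])
50 |         else:
51 |             medians.append((-lower[0] + upper[0]) / 2)
52 |     return medians
53 | 
54 | print(findRunningMedian_v2([1, 2, 3, 4, 5]))  # [1, 1.5, 2, 2.5, 3]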
--------------------------------------------------------------------------------
/search_unique.py:
--------------------------------------------------------------------------------
1 | # Problem: Given a sorted array in which all elements
2 | # appear twice (one after one) and one element
3 | # appears only once. Find that element
4 | # Constraints: in O(log n) complexity.
5 | def searchUnique(arr, low, high):
6 | # Base Cases
7 | # 1. low is greater than high
8 | # 2. Array with single element
9 | if low > high:
10 | return None
11 | if low == high:
12 | return arr[low]
13 | # Find the middle element
14 | mid = low + (high - low) // 2
15 | # if the middle element lies at even place,
16 | # check i and i+1, if they are same, go to right else left
17 | if mid % 2 == 0:
18 | if arr[mid] == arr[mid + 1]:
19 | return searchUnique(arr, mid + 2, high)
20 | else:
21 | return searchUnique(arr, low, mid)
22 | # if the middle element lies at odd place,
23 | # check i-1 and i, if they are same, go to right else left
24 | # Replace mid by mid-1 for above block
25 | else:
26 | if arr[mid - 1] == arr[mid]:
27 | return searchUnique(arr, mid + 1, high)
28 | else:
29 | return searchUnique(arr, low, mid - 1)
30 |
31 |
32 | array = [1, 1, 2, 4, 4, 5, 5, 6, 6]
33 | result = searchUnique(array, 0, len(array) - 1)
34 | print(result)
35 | # Result: 2
36 |
--------------------------------------------------------------------------------
/selection_sort.py:
--------------------------------------------------------------------------------
1 | # A simple implementation of Selection Sort
2 |
3 | def selectionSort(arr):
4 | i = 0
5 | while i < len(arr):
6 | min_index = i
7 | for j in range(i+1, len(arr)):
8 | if arr[j] < arr[min_index]:
9 | min_index = j
10 |
11 | arr[i], arr[min_index] = arr[min_index], arr[i]
12 | i = i + 1
13 |
14 | return arr
15 |
16 | arr = [2, 6, 1, 5, 3, 4]
17 | res = selectionSort(arr)
18 | print(res)
19 |
20 |
--------------------------------------------------------------------------------
/signOfProduct.py:
--------------------------------------------------------------------------------
1 | # Problem: Given an array arr[] of n integers,
2 | # the integers can be positive, negative or 0
3 | # return the sign of the product of the elements
4 | # 1 : positive
5 | # -1 : negative
6 | # 0 : zero
7 |
8 |
9 | def getSignOfProduct(array):
10 |
11 | sign = 1
12 |
13 | for num in array:
14 |
15 | if num == 0:
16 | return 0
17 |
18 | if num < 0:
19 | sign = -1 * sign
20 |
21 | return sign
22 |
23 |
24 | arr = [10, 45, -9, 3, -4, -5, 7, 32 , 0, 12 , 45, -1]
25 | res = getSignOfProduct(arr)
26 | print(arr, res)
27 |
28 | # Result: [10, 45, -9, 3, -4, -5, 7, 32, 0, 12, 45, -1] 0
29 |
--------------------------------------------------------------------------------
/stack_data_structure.py:
--------------------------------------------------------------------------------
1 | class Stack:
2 | def __init__(self):
3 | self.stack = []
4 |
5 | def push(self, value):
6 | self.stack.append(value)
7 |
8 | def pop(self):
9 | if self.isEmpty():
10 | print("Stack underflow...")
11 | return None
12 |         else:
13 |             return self.stack.pop()
14 |
15 | def size(self):
16 | return len(self.stack)
17 |
18 | def isEmpty(self):
19 | return self.size() == 0
20 |
21 | def peek(self):
22 | if self.isEmpty():
23 | return None
24 | else:
25 | return self.stack[-1]
26 |
--------------------------------------------------------------------------------
/stock_span.py:
--------------------------------------------------------------------------------
1 | # Context : The span Si of the stock’s price on a given day i
2 | # is defined as the maximum number of consecutive days just before
3 | # the given day, for which the price of the stock on the current
4 | # day is less than or equal to its price on the given day.
5 |
6 | # Problem: We have a series of n daily price quotes for a stock
7 | # and we need to calculate span of stock’s price for all n days
8 |
9 |
10 | def calculate_span(stock_quotes, span):
11 | # span for the first quote will always be 1
12 | span[0] = 1
13 | for i in range(1, len(stock_quotes), 1):
14 | # initialize span value to be 1 for each ith quote
15 | span[i] = 1
16 | # scan for all the quotes to the left
17 | j = i - 1
18 |         # if the preceding quote has a value less than or equal to the current quote
19 | # increase the span value of the current quote
20 | while j >= 0 and stock_quotes[i] >= stock_quotes[j]:
21 | span[i] = span[i] + 1
22 | j = j - 1
23 | return span
24 |
25 |
26 | quotes = [10, 4, 5, 90, 120, 80]
27 | # initialize span as a placeholder list of the same length as quotes
28 | span_list = [None] * len(quotes)
29 | print(calculate_span(quotes, span_list))
30 | # Result : [1, 1, 2, 4, 5, 1]
31 |
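32 | # The nested scan above is O(n^2) in the worst case (e.g. strictly
33 | # increasing prices). A monotonic stack of indices brings it down to
34 | # O(n); a sketch, not part of the original file:
35 |
36 | def calculate_span_stack(stock_quotes):
37 |     span = [0] * len(stock_quotes)
38 |     stack = []  # indices of quotes strictly greater than the current one
39 |     for i, price in enumerate(stock_quotes):
40 |         # discard indices whose quotes are <= the current price;
41 |         # they can never bound a later span
42 |         while stack and stock_quotes[stack[-1]] <= price:
43 |             stack.pop()
44 |         # the span reaches back to the previous strictly greater quote
45 |         span[i] = i + 1 if not stack else i - stack[-1]
46 |         stack.append(i)
47 |     return span
48 |
49 | print(calculate_span_stack(quotes))
50 | # Result : [1, 1, 2, 4, 5, 1]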
--------------------------------------------------------------------------------
/sum_array_recursion.py:
--------------------------------------------------------------------------------
1 | # Write a program to sum a given array
2 | # using recursion
3 |
4 | def recur_sum(arr, i):
5 |     # base case: the sum of arr[0..0] is just arr[0]
6 |     if i < 1:
7 |         return arr[i]
8 |     else:
9 |         # sum of arr[0..i] = arr[i] + sum of arr[0..i-1]
10 |         return arr[i] + recur_sum(arr, i-1)
11 |
12 |
13 | arr = [-1, 2, -3, 4, 5]
14 | print(recur_sum(arr, len(arr)-1))
15 | # Result: 7
16 |
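17 | # Note (not in the original file): CPython's default recursion limit
18 | # (about 1000 frames) caps the usable array length here; for long
19 | # arrays the built-in sum() is the safer choice.
20 | print(sum(arr))  # 7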
--------------------------------------------------------------------------------
/timeseries.py:
--------------------------------------------------------------------------------
1 | # Libraries Included:
2 | # Numpy, Scipy, Scikit, Pandas
3 |
4 | # Merge two (date, value) timeseries, averaging the values of
5 | # dates that appear in both series.
6 |
7 | def get_merged(series1, series2):
8 |     temp = {}
9 |
10 |     for date, value in series1 + series2:
11 |         if date not in temp:
12 |             temp[date] = value
13 |         else:
14 |             # date already seen: average the stored value with the new one
15 |             temp[date] = (temp[date] + value) / 2
16 |
17 |     # sort chronologically by date key
18 |     return sorted(temp.items(), key=lambda item: item[0])
19 |
20 | # timeseries data
21 | series1 = [
22 |     ('2010-01-01', 34),
23 |     ('2010-01-02', 27),
24 |     ('2010-01-04', 58),
25 |     ('2010-01-05', 22)]
26 |
27 | series2 = [
28 |     ('2010-01-01', 15),
29 |     ('2010-01-03', 39),
30 |     ('2010-01-05', 23),
31 |     ('2010-01-06', 47)]
32 |
33 | res = get_merged(series1, series2)
34 | print(res)
35 |
36 | # merged = [
37 | #     ('2010-01-01', 24.5),
38 | #     ('2010-01-02', 27),
39 | #     ('2010-01-03', 39),
40 | #     ('2010-01-04', 58),
41 | #     ('2010-01-05', 22.5),
42 | #     ('2010-01-06', 47)]
43 |
44 | # Generalization to any number of series: a per-date frequency
45 | # counter keeps the running average correct, since
46 | # (curr + value) / 2 is only right when a date occurs exactly twice.
47 |
48 | def get_merged_many(all_series):
49 |     temp = {}
50 |     freq_dict = {}  # date -> number of values seen so far
51 |     for series in all_series:
52 |         for date, value in series:
53 |             if date not in temp:
54 |                 temp[date] = value
55 |                 freq_dict[date] = 1
56 |             else:
57 |                 count = freq_dict[date]
58 |                 # recover the running sum, fold in the new value, re-average
59 |                 temp[date] = (temp[date] * count + value) / (count + 1)
60 |                 freq_dict[date] = count + 1
61 |     return sorted(temp.items(), key=lambda item: item[0])
62 |
63 | print(get_merged_many([series1, series2]))
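64 |
65 | # Alternative sketch using pandas (one of the libraries noted in the
66 | # header; not part of the original solution). mean(axis=1) skips NaN,
67 | # so dates present in only one series keep their single value, and
68 | # the results come back as floats.
69 | import pandas as pd
70 |
71 | s1 = pd.Series(dict(series1))
72 | s2 = pd.Series(dict(series2))
73 | merged = pd.concat([s1, s2], axis=1).mean(axis=1).sort_index()
74 | print(list(merged.items()))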
--------------------------------------------------------------------------------
/union_arrays.py:
--------------------------------------------------------------------------------
1 | # Problem: Given two sorted arrays of sizes m and n
2 | # in which all elements are distinct, find the
3 | # union of the two arrays.
4 | # Constraint: O(m+n) complexity (see the note below).
5 |
6 | def unionArrays(x, y, m, n):
7 |     union_arr = []
8 |
9 |     # take every element of x, then each element of y
10 |     # that is not already present in the union
11 |     for i in range(m):
12 |         union_arr.append(x[i])
13 |
14 |     for j in range(n):
15 |         if y[j] not in union_arr:
16 |             union_arr.append(y[j])
17 |
18 |     print(union_arr)
19 |     return union_arr
20 |
21 | list_a = [1, 2, 3, 4, 5]
22 | list_b = [2, 3, 5, 6]
23 | unionArrays(list_a, list_b, len(list_a), len(list_b))
24 |
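25 | # The membership test `y[j] not in union_arr` rescans the result list,
26 | # making the version above O(m*n) in the worst case. A two-pointer
27 | # merge over the sorted, distinct inputs meets the stated O(m+n)
28 | # bound; a sketch, not part of the original file:
29 |
30 | def unionArraysLinear(x, y, m, n):
31 |     union_arr = []
32 |     i, j = 0, 0
33 |     # advance the pointer holding the smaller element;
34 |     # a common element is emitted once and both pointers advance
35 |     while i < m and j < n:
36 |         if x[i] < y[j]:
37 |             union_arr.append(x[i])
38 |             i += 1
39 |         elif y[j] < x[i]:
40 |             union_arr.append(y[j])
41 |             j += 1
42 |         else:
43 |             union_arr.append(x[i])
44 |             i += 1
45 |             j += 1
46 |     # leftovers from either array are already unique and sorted
47 |     union_arr.extend(x[i:m])
48 |     union_arr.extend(y[j:n])
49 |     return union_arr
50 |
51 | print(unionArraysLinear(list_a, list_b, len(list_a), len(list_b)))
52 | # Result: [1, 2, 3, 4, 5, 6]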
--------------------------------------------------------------------------------
/username_validation.py:
--------------------------------------------------------------------------------
1 | # Have the function UsernameValidation(`str`) take the `str` parameter being passed and determine if the string is a valid username according to the following rules:
2 |
3 | # 1. The username is between 4 and 25 characters.
4 | # 2. It must start with a letter.
5 | # 3. It can only contain letters, numbers, and the underscore character.
6 | # 4. It cannot end with an underscore character.
7 |
8 | # If the username is valid, your program should return the string `true`, otherwise return the string `false`.
9 |
10 | def UsernameValidation(strParam):
11 |
12 |     # username must be between 4 and 25 characters
13 |     if len(strParam) < 4 or len(strParam) > 25:
14 |         return 'false'
15 |
16 |     # must start with a letter
17 |     if not strParam[0].isalpha():
18 |         return 'false'
19 |
20 |     # cannot end with an underscore
21 |     if strParam[-1] == '_':
22 |         return 'false'
23 |
24 |     # may contain only letters, numbers, and underscores
25 |     valid_grammar = set('abcdefghijklmnopqrstuvwxyz0123456789_')
26 |
27 |     for ch in strParam:
28 |         if ch.lower() not in valid_grammar:
29 |             return 'false'
30 |
31 |     return 'true'
32 |
33 | # test cases
34 | TC1 = "aa_"
35 | TC2 = "uaa__hello_worldW"
36 |
37 | print(TC1, UsernameValidation(TC1))  # aa_ false
38 | print(TC2, UsernameValidation(TC2))  # uaa__hello_worldW true
39 |
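40 | # An equivalent check via one regular expression (a sketch, not part
41 | # of the original solution): a letter, then 2-23 letters, digits, or
42 | # underscores, then a letter or digit, i.e. 4-25 characters total,
43 | # starting with a letter and not ending in an underscore.
44 | import re
45 |
46 | USERNAME_RE = re.compile(r'^[A-Za-z][A-Za-z0-9_]{2,23}[A-Za-z0-9]$')
47 |
48 | def UsernameValidationRegex(strParam):
49 |     return 'true' if USERNAME_RE.match(strParam) else 'false'
50 |
51 | print(TC1, UsernameValidationRegex(TC1))  # aa_ false
52 | print(TC2, UsernameValidationRegex(TC2))  # uaa__hello_worldW true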
--------------------------------------------------------------------------------