├── canonscraper.py └── README.md /canonscraper.py: -------------------------------------------------------------------------------- 1 | """Retrieve vote score (in sorted order by post type) for all posts listed in README.md.""" 2 | from bs4 import BeautifulSoup 3 | import requests 4 | import re 5 | import pandas as pd 6 | import html 7 | 8 | user_id = '4909087' 9 | user_url = f'https://api.stackexchange.com/2.2/users/{user_id}?order=desc&sort=reputation&site=stackoverflow' 10 | username = requests.get(user_url).json()['items'][0]['display_name'] 11 | 12 | repo = 'https://github.com/Coldsp33d/stackoverflow-pandas-canonicals/blob/master/README.md' 13 | posts_url = 'https://api.stackexchange.com/2.2/posts/{}?pagesize=100&order=desc&sort=votes&site=stackoverflow' 14 | url = { 15 | 'question': 'https://api.stackexchange.com/2.2/questions/{}?order=desc&sort=activity&site=stackoverflow', 16 | 'answer': 'https://api.stackexchange.com/2.2/answers/{}?order=desc&sort=activity&site=stackoverflow&filter=!-*jbN.OXJB.4&pagesize=100' 17 | } 18 | 19 | soup = BeautifulSoup(requests.get(repo).text, 'lxml') 20 | 21 | ids = set() 22 | for tag in soup.find_all(href=re.compile('stackoverflow.com/.*')): 23 | ids = ids.union(re.findall(r"\b\d+\b", tag['href'])) 24 | 25 | ids -= {user_id} 26 | 27 | 28 | data = requests.get(posts_url.format(';'.join(ids))).json()['items'] 29 | df = pd.DataFrame([ 30 | {'post_id': str(p['post_id']), 'type': p['post_type']} 31 | for p in data if p['owner']['display_name'] == username 32 | ] 33 | ) 34 | # Get the answers to all my questions. 35 | canonq_answers = (df.query('type == "question"')['post_id'] 36 | .astype(int) 37 | .add(1) 38 | .astype(str) 39 | .to_frame() 40 | .assign(type='answer')) 41 | df = pd.concat([df, canonq_answers], ignore_index=True) 42 | 43 | data = pd.DataFrame(df.groupby('type')[['post_id']] 44 | .agg(';'.join) 45 | .apply(lambda x: requests.get(url[x.name].format(x['post_id'])).json()['items'], axis=1) 46 | .to_frame('post_info') 47 | .apply(lambda x: [{ 48 | f'post_id': str(p[f'{x.name}_id']), 49 | 'title': html.unescape(p['title']), 50 | 'score': p['score'], 51 | 'link': p['link'], 52 | 'creation_date': p['creation_date'], 53 | } for p in x['post_info'] 54 | ], axis=1) 55 | .sum()) 56 | # Get the age of each post. 57 | days_since_creation = (pd.Timestamp('today') - pd.to_datetime(data.pop('creation_date'), unit='s', origin='unix')).dt.days 58 | # How many votes you're expected to get in a year. This is expected to change with age, number of votes, and (for answers) position on the page. 59 | data.insert(data.columns.get_loc('score') + 1, 60 | 'passive_rep_rate', 61 | (data['score'] / days_since_creation * 365.25).astype(int)) 62 | 63 | print(df.merge(data, on='post_id') 64 | .sort_values(['type', 'passive_rep_rate'], ascending=[1, 0]) 65 | .reset_index(drop=1) 66 | .set_index('post_id') 67 | .to_string()) 68 | 69 | 70 | """ 71 | type title score passive_rep_rate link 72 | post_id 73 | 55557758 answer How to iterate over rows in a DataFrame in Pandas 576 430 https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas/55557758#55557758 74 | 53645883 answer Pandas Merging 101 571 341 https://stackoverflow.com/questions/53645882/pandas-merging-101/53645883#53645883 75 | 54508052 answer Convert pandas dataframe to NumPy array 269 178 https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array/54508052#54508052 76 | 61922965 answer pandas GroupBy columns with NaN (missing) values 35 161 https://stackoverflow.com/questions/18429491/pandas-groupby-columns-with-nan-missing-values/61922965#61922965 77 | 56746204 answer Creating an empty Pandas DataFrame, then filling it? 175 155 https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it/56746204#56746204 78 | ... 79 | 53645882 question Pandas Merging 101 395 236 https://stackoverflow.com/questions/53645882/pandas-merging-101 80 | 53927460 question Select rows in pandas MultiIndex DataFrame 147 90 https://stackoverflow.com/questions/53927460/select-rows-in-pandas-multiindex-dataframe 81 | 54432583 question When should I ever want to use pandas apply() in my code? 109 71 https://stackoverflow.com/questions/54432583/when-should-i-ever-want-to-use-pandas-apply-in-my-code 82 | 54028199 question Are for-loops in pandas really bad? When should I care? 104 65 https://stackoverflow.com/questions/54028199/are-for-loops-in-pandas-really-bad-when-should-i-care 83 | 53779986 question Dynamic Expression Evaluation in pandas using pd.eval() 57 34 https://stackoverflow.com/questions/53779986/dynamic-expression-evaluation-in-pandas-using-pd-eval 84 | ... 85 | """ 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A compilation of all my canonical posts and answers to old questions on Stack Overflow. Topic arrangement mirrors the [User Guide](https://pandas.pydata.org/pandas-docs/stable/user_guide/). 2 | 3 | If you find any bugs, or need clarification, or see something that can be improved, please feel free to leave a comment under the answer and I'll typically respond within a day. 4 | 5 | If you found any of my content here helpful and wish to thank me, you can upvote my answer! (please don't serial upvote :-) If you'd like to do more, and have more than 75 reputation on Stack Overflow, please consider awarding me with a [bounty](https://stackoverflow.com/help/privileges/set-bounties). 6 | 7 | ## Pandas Gotchas 8 | 9 | - Don't iterate over a DataFrame! 10 | - [How to iterate over rows in a DataFrame in Pandas?](https://stackoverflow.com/a/55557758/4909087) 11 | - [Does pandas iterrows have performance issues?](https://stackoverflow.com/a/65356169/4909087) 12 | 13 | - Never grow a DataFrame! 14 | - [Creating an empty Pandas DataFrame, then filling it?](https://stackoverflow.com/a/56746204/4909087) 15 | - [Add one row to pandas DataFrame](https://stackoverflow.com/a/62734983/4909087) 16 | 17 | - Good habits to build to avoid that dreaded `SettingWithCopyWarning` 18 | - [How to deal with SettingWithCopyWarning in Pandas?](https://stackoverflow.com/a/53954986/4909087) 19 | 20 | - Don't use `inplace=True`! 21 | - [Understanding inplace=True](https://stackoverflow.com/a/59242208/4909087) 22 | - [Pandas - is inplace = True considered harmful or not?](https://stackoverflow.com/q/45570984/4909087) 23 | 24 | 25 | ## IO tools (text, CSV, HDF5, …) 26 | 27 | - [How can I effectively load data on Stack Overflow questions using pandas read_clipboard?](https://stackoverflow.com/q/65379068/4909087) 28 | - [Writing a pandas DataFrame to CSV file](https://stackoverflow.com/a/56241457/4909087) 29 | - [Import CSV file as a pandas DataFrame](https://stackoverflow.com/a/56231664/4909087) 30 | - [How do I save multi-indexed pandas dataframes to parquet?](https://stackoverflow.com/a/65356509/4909087) 31 | 32 | 33 | ## Indexing and selecting data 34 | - [How to implement 'in' and 'not in' for Pandas dataframe](https://stackoverflow.com/a/55554709/4909087) 35 | - [Combine duplicated columns within a DataFrame](https://stackoverflow.com/a/54300430/4909087) 36 | - [Deleting all columns except a few](https://stackoverflow.com/a/54315757/4909087) 37 | - [Right way to reverse a pandas DataFrame?](https://stackoverflow.com/a/65391420/4909087) 38 | 39 | ## MultiIndex / advanced indexing 40 | - [How do I slice or filter MultiIndex DataFrame levels?](https://stackoverflow.com/questions/53927460/select-rows-in-pandas-multiindex-dataframe) 41 | - [Selecting columns from pandas MultiIndex](https://stackoverflow.com/a/54337009/4909087) 42 | - [Setting DataFrame column headers to a MultiIndex](https://stackoverflow.com/a/54335583/4909087) 43 | - [Reorder levels of MultiIndex in a pandas DataFrame](https://stackoverflow.com/a/62746392/4909087) 44 | 45 | 46 | ## Merge, join, and concatenate 47 | - **Pandas Merging 101** 48 | - [Merging basics - basic types of joins](https://stackoverflow.com/a/53645883/4909087) (read this first) 49 | 50 | - [Index-based joins](https://stackoverflow.com/a/65167356/4909087) 51 | 52 | - [Generalizing to multiple DataFrames](https://stackoverflow.com/a/65167327/4909087) 53 | 54 | 55 | - [Performant Cross join](https://stackoverflow.com/a/53699013/4909087) 56 | 57 | - [Cartesian Product in pandas](https://stackoverflow.com/a/65017552/4909087) 58 | 59 | ## Reshaping and pivot tables 60 | - [Split (explode) pandas dataframe string entry to separate rows](https://stackoverflow.com/a/57122617/4909087) 61 | - [Pandas column of lists, create a row for each list element](https://stackoverflow.com/a/57122831/4909087) 62 | 63 | 64 | ## Working with text data 65 | - [Convert Columns to String in Pandas](https://stackoverflow.com/a/62978895/4909087) Introduce `"string"` dtype for pandas >= 1.0. 66 | - [Fast punctuation removal with pandas](https://stackoverflow.com/questions/50444346/fast-punctuation-removal-with-pandas) 67 | - [How to lowercase a pandas dataframe string column if it has missing values?](https://stackoverflow.com/a/56084317/4909087) 68 | 69 | - [String concatenation of two pandas columns](https://stackoverflow.com/a/54298586/4909087) 70 | - [Remove unwanted parts from strings in a column](https://stackoverflow.com/a/54302517/4909087) 71 | - [Select by partial string from a pandas DataFrame](https://stackoverflow.com/a/55335207/4909087) 72 | - [Get first letter of a string from column](https://stackoverflow.com/a/55532764/4909087) 73 | 74 | 75 | ## Working with missing data 76 | - [How to drop rows of Pandas DataFrame whose value in a certain column is NaN?](https://stackoverflow.com/a/62444845/4909087) 77 | - [GroupBy columns with NaN (missing) values](https://stackoverflow.com/a/61922965/4909087) 78 | - [How to check if any value is NaN in a Pandas DataFrame](https://stackoverflow.com/a/53862445/4909087) 79 | - [Convert pandas.Series from dtype object to float, and errors to nans](https://stackoverflow.com/a/47942854/4909087) 80 | - [How to replace values with None in Pandas data frame in Python?](https://stackoverflow.com/a/55469393/4909087) 81 | - [Locate first and last non NaN values in a Pandas DataFrame](https://stackoverflow.com/a/56748194/4909087) - a discussion on `first_valid_index` and `last_valid_index` 82 | 83 | ## Categorical data 84 | 85 | 86 | ## Nullable integer data type 87 | - [Pandas: ValueError: cannot convert float NaN to integer](https://stackoverflow.com/a/55704512/4909087) 88 | 89 | ## Nullable Boolean Data Type 90 | - [Preserve NaN values in pandas boolean comparisons](https://stackoverflow.com/a/60203554/4909087) 91 | 92 | 93 | ## Visualization 94 | - [Compare two DataFrames and output their differences side-by-side](https://stackoverflow.com/a/62687227/4909087) 95 | - [Pretty Printing a pandas dataframe](https://stackoverflow.com/a/60202636/4909087) 96 | 97 | 98 | ## Computational tools 99 | 100 | 101 | ## Group By: split-apply-combine 102 | - [Multiple aggregations of the same column using pandas GroupBy.agg()](https://stackoverflow.com/a/54300159/4909087) 103 | - [Pandas GroupBy.apply method duplicates first group](https://stackoverflow.com/a/56215416/4909087) 104 | - [Get statistics for each group (such as count, mean, etc) using pandas GroupBy?](https://stackoverflow.com/a/55564299/4909087) 105 | - [GroupBy pandas DataFrame and select most common value](https://stackoverflow.com/a/54304691/4909087) 106 | - [Python Pandas Create New Column with Groupby().Sum()](https://stackoverflow.com/a/54417351/4909087) 107 | - [How to get number of groups in a groupby object in pandas?](https://stackoverflow.com/a/46512052/4909087) 108 | 109 | 110 | ## Time series / date functionality 111 | - [pandas datetime to unix timestamp seconds](https://stackoverflow.com/a/54313505/4909087) 112 | 113 | ## Time deltas 114 | 115 | 116 | ## Styling 117 | 118 | 119 | ## Options and settings 120 | 121 | 122 | ## Performance 123 | - [For loops with pandas - When should I care?](https://stackoverflow.com/questions/54028199/for-loops-with-pandas-when-should-i-care) 124 | - [Dynamic Expression Evaluation in pandas using pd.eval()](https://stackoverflow.com/questions/53779986/dynamic-expression-evaluation-in-pandas-using-pd-eval) 125 | - [When should I (not) want to use pandas apply() in my code?](https://stackoverflow.com/questions/54432583/when-should-i-ever-want-to-use-pandas-apply-in-my-code) 126 | - [Performant cartesian product (CROSS JOIN) with pandas](https://stackoverflow.com/questions/53699012/performant-cartesian-product-cross-join-with-pandas) 127 | - [What is the performance impact of non-unique indexes in pandas?](https://stackoverflow.com/a/54317984/4909087) 128 | 129 | 130 | ## Scaling to large datasets 131 | 132 | 133 | ## Sparse data structures 134 | 135 | - [Whats the right way to use a SparseDataFrame in Pandas?](https://stackoverflow.com/a/65324029/4909087) 136 | 137 | ## Frequently Asked Questions (FAQ) 138 | - [How to iterate over rows in a DataFrame in Pandas?](https://stackoverflow.com/a/55557758/4909087) 139 | - [What is the difference between Series.replace and Series.str.replace?](https://stackoverflow.com/questions/56625031/what-is-the-difference-between-series-replace-and-series-str-replace) 140 | - [Add column of empty lists to DataFrame](https://stackoverflow.com/a/62141252/4909087) 141 | - [Change data type of columns in Pandas](https://stackoverflow.com/a/60278450/4909087) 142 | - [Drop rows containing empty cells from a pandas DataFrame](https://stackoverflow.com/a/56708633/4909087) 143 | - [How to get rid of “Unnamed: 0” column in a pandas DataFrame?](https://stackoverflow.com/a/54358758/4909087) 144 | - [Difference between map, applymap and apply methods in Pandas](https://stackoverflow.com/a/56300992/4909087) 145 | - [Convert list of dictionaries to a pandas DataFrame](https://stackoverflow.com/a/53831756/4909087) 146 | - [Find the max of two or more columns with pandas](https://stackoverflow.com/a/54299629/4909087) 147 | - [Sorting by absolute value without changing the data](https://stackoverflow.com/a/54299995/4909087) 148 | - [How do I convert a pandas column or index to a Numpy array?](https://stackoverflow.com/a/54324513/4909087) 149 | - [Convert pandas dataframe to NumPy array](https://stackoverflow.com/a/54508052/4909087) 150 | - [Logical operators for boolean indexing in Pandas](https://stackoverflow.com/a/54358361/4909087) 151 | - [What is the difference between size and count in pandas?](https://stackoverflow.com/a/54364400/4909087) 152 | - ['DataFrame' object has no attribute 'sort'](https://stackoverflow.com/a/54399214/4909087) 153 | - [Python pandas insert list into a cell](https://stackoverflow.com/a/54399996/4909087) 154 | - [How to add a suffix (or prefix) to each column name?](https://stackoverflow.com/a/54410631/4909087) 155 | - [How do I get the row count of a Pandas dataframe?](https://stackoverflow.com/a/55435185/4909087) 156 | - [Rename a specific column in pandas](https://stackoverflow.com/a/46146667/4909087) 157 | - [Get list from pandas DataFrame column headers](https://stackoverflow.com/a/55491499/4909087) 158 | 159 | 160 | 161 | ## Cookbook 162 | 163 | 164 | 165 | 166 | 167 | --------------------------------------------------------------------------------