├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md └── datasets ├── PandasEval1.json └── PandasEval2.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to 4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 5 | and actually do, grant us the rights to use your contribution. 
For details, visit 6 | https://cla.microsoft.com. 7 | 8 | ## Code of conduct 9 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 10 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 11 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jigsaw Datasets 2 | The datasets folder contains the two datasets described in our [paper](https://arxiv.org/abs/2112.02969) - 3 | 4 | 1.) PandasEval1 - This dataset was collected by the authors of the paper and consists of 68 entries 5 | 6 | 2.) PandasEval2 - This dataset was collected in the form of a hackathon user study across two sessions with differing tasks. Each task contains multiple sets with minor variations such as scalar/constant differences. Some tasks might have semantically different sets. It comprises 21 unique tasks, and for every task at most 5 variations/sets. For each set there are multiple natural language variations leading to a total of 725 entries. 7 | 8 | Both of these JSON files follow the structure described below. 9 | 10 | * The outermost level contains key-value pairs with the unique task id. 11 | * For each task, we have key-value pairs for the various sets in the task. 12 | * For each set, we have 13 | - a list of queries along with user-ids who wrote those queries 14 | - one or more io examples.
Each io example is a dict containing 15 | + code snippet for inputs 16 | + code snippet for output 17 | + corresponding names for inputs and outputs 18 | - one or more correct solutions 19 | 20 | In case you find this work useful, please cite it as 21 | ``` 22 | @inproceedings{Jigsaw, 23 | author = {Jain, Naman and Vaidyanath, Skanda and Iyer, Arun and Natarajan, Nagarajan and Parthasarathy, Suresh and Rajamani, Sriram and Sharma, Rahul}, 24 | title = {Jigsaw: Large Language Models meet Program Synthesis}, 25 | booktitle = {ICSE 2022}, 26 | location = {Pittsburgh, Pennsylvania}, 27 | } 28 | ``` -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /datasets/PandasEval1.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "sets": { 4 | "A": { 5 | "queries": [ 6 | { 7 | "query": "Retain rows from dataframe df1 where value of EPS is not equal to 89", 8 | "user": "u0" 9 | } 10 | ], 11 | "ios": [ 12 | { 13 | "inputs": [ 14 | "pd.DataFrame({'STK_ID': {0: 601166.0, 1: 600036.0, 2: 600016.0, 3: 601009.0, 4: 601939.0, 5: 1.0}, 'RPT_Date': {0: 20111231.0, 1: 20111231.0, 2: 20111231.0, 3: 20111231.0, 4: 20111231.0, 5: 20111231.0}, 'STK_ID1': {0: 601166.0, 1: 600036.0, 2: 600016.0, 3: 601009.0, 4: 601939.0, 5: 1.0}, 'EPS': {0: 89.0, 1: 89.0, 2: 4.3, 3: 89.0, 4: 2.5, 5: 89.0}, 'cash': {0: 0.0, 1: 12.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0}})" 15 | ], 16 | "output": "pd.DataFrame({'STK_ID': {2: 600016.0, 4: 601939.0}, 'RPT_Date': {2: 20111231.0, 4: 20111231.0}, 'STK_ID1': {2: 600016.0, 4: 601939.0}, 'EPS': {2: 4.3, 4: 2.5}, 'cash': {2: 0.0, 4: 0.0}})", 17 | "invars": [ 18 | "df1" 19 | ], 20 | "outvar": "dfout" 21 | } 22 | ], 23 | "solutions": [ 24 | [ 25 | "dfout = df1[(df1['EPS'] != 89)]" 26 | ] 27 | ] 28 | } 29 | } 30 | }, 31 | "1": { 32 | "sets": { 33 | "A": { 34 | "queries": [ 35 | { 36 | "query": "take first three rows in df1 and assign it to dfout", 37 | "user": "u0" 38 | } 39 | ], 40 | "ios": [ 41 | { 42 | "inputs": [ 43 | "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1, 4: 9, 5: -5, 6: 3}, 'B': {0: 2, 1: 4, 2: 6, 3: 1, 4: 1, 5: 7, 6: 3}})" 44 | ], 45 | "output": "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2}, 'B': {0: 2, 1: 4, 2: 6}})", 46 | "invars": [ 47 | "df1" 48 | ], 49 | "outvar": "dfout" 50 | } 51 | ], 52 | "solutions": [ 53 | [ 54 | "dfout = df1[:3]" 55 | ] 56 | ] 57 | } 58 | } 59 | }, 60 | "2": { 61 | "sets": { 62 | "A": { 63 | "queries": [ 64 | { 65 | "query": "Delete the rows of data frame 'df2' from 'df1' and assign the result back to 'df1'", 66 | "user": "u0" 67 | } 68 | 
], 69 | "ios": [ 70 | { 71 | "inputs": [ 72 | "pd.DataFrame({'C1': {0: 'E1', 1: 'E3'}, 'C2': {0: 'E2', 1: 'E4'}})", 73 | "pd.DataFrame({'C1': {0: 'E1'}, 'C2': {0: 'E2'}})" 74 | ], 75 | "output": "pd.DataFrame({'C1': {1: 'E3'}, 'C2': {1: 'E4'}})", 76 | "invars": [ 77 | "df1", 78 | "df2" 79 | ], 80 | "outvar": "df1" 81 | }, 82 | { 83 | "inputs": [ 84 | "pd.DataFrame({'one': {0: 1, 1: 4, 2: 7, 3: 10}, 'two': {0: 2, 1: 5, 2: 8, 3: 11}, 'three': {0: 3, 1: 6, 2: 9, 3: 12}})", 85 | "pd.DataFrame({'one': {0: 1, 1: 7}, 'two': {0: 2, 1: 8}, 'three': {0: 3, 1: 9}})" 86 | ], 87 | "output": "pd.DataFrame({'one': {0: 4, 1: 10}, 'two': {0: 5, 1: 11}, 'three': {0: 6, 1: 12}})", 88 | "invars": [ 89 | "df1", 90 | "df2" 91 | ], 92 | "outvar": "df1" 93 | } 94 | ], 95 | "solutions": [ 96 | [ 97 | "df1 = df1.drop(df2.index)" 98 | ] 99 | ] 100 | } 101 | } 102 | }, 103 | "3": { 104 | "sets": { 105 | "A": { 106 | "queries": [ 107 | { 108 | "query": "Given the dataframe df1 and replace all instances of 5 in column 'id' with 1005, column 'idnew' with -1005 and assign it to dfout", 109 | "user": "u0" 110 | } 111 | ], 112 | "ios": [ 113 | { 114 | "inputs": [ 115 | "pd.DataFrame({'id': {0: 5, 1: -45, 2: 16, 3: -1, 4: 111}, 'idnew': {0: 15, 1: 18, 2: 16, 3: -111, 4: 5}, 'A': {0: 'a', 1: 'x', 2: 'r', 3: 'uprime', 4: 'None'}, 'B': {0: 'z', 1: 'None', 2: 'x', 3: 'wwww', 4: 'xy'}})" 116 | ], 117 | "output": "pd.DataFrame({'id': {0: 1005, 1: -45, 2: 16, 3: -1, 4: 111}, 'idnew': {0: 15, 1: 18, 2: 16, 3: -111, 4: -1005}, 'A': {0: 'a', 1: 'x', 2: 'r', 3: 'uprime', 4: 'None'}, 'B': {0: 'z', 1: 'None', 2: 'x', 3: 'wwww', 4: 'xy'}})", 118 | "invars": [ 119 | "df1" 120 | ], 121 | "outvar": "dfout" 122 | } 123 | ], 124 | "solutions": [ 125 | [ 126 | "dfout = df1.replace(to_replace={\n 'id': {\n 5: 1005,\n },\n 'idnew': {\n 5: (- 1005),\n },\n}, method='ffill')", 127 | "dfout = df1.replace(to_replace={\n 'id': {\n 5: 1005,\n },\n 'idnew': {\n 5: (- 1005),\n },\n})", 128 | "dfout = df1.replace(to_replace={\n 
'id': {\n 5: 1005,\n },\n 'idnew': {\n 5: (- 1005),\n },\n}, method='pad')", 129 | "dfout = df1.replace(to_replace={\n 'id': {\n 5: 1005,\n },\n 'idnew': {\n 5: (- 1005),\n },\n}, method='bfill')" 130 | ] 131 | ] 132 | } 133 | } 134 | }, 135 | "4": { 136 | "sets": { 137 | "A": { 138 | "queries": [ 139 | { 140 | "query": "Delete the rows of data frame 'df2' from 'df1' and assign the result back to 'df2'", 141 | "user": "u0" 142 | } 143 | ], 144 | "ios": [ 145 | { 146 | "inputs": [ 147 | "pd.DataFrame({'C1': {0: 'E1', 1: 'E3'}, 'C2': {0: 'E2', 1: 'E4'}})", 148 | "pd.DataFrame({'C1': {0: 'E1'}, 'C2': {0: 'E2'}})" 149 | ], 150 | "output": "pd.DataFrame({'C1': {1: 'E3'}, 'C2': {1: 'E4'}})", 151 | "invars": [ 152 | "df1", 153 | "df2" 154 | ], 155 | "outvar": "df2" 156 | }, 157 | { 158 | "inputs": [ 159 | "pd.DataFrame({'one': {0: 1, 1: 4, 2: 7, 3: 10}, 'two': {0: 2, 1: 5, 2: 8, 3: 11}, 'three': {0: 3, 1: 6, 2: 9, 3: 12}})", 160 | "pd.DataFrame({'one': {0: 1, 1: 7}, 'two': {0: 2, 1: 8}, 'three': {0: 3, 1: 9}})" 161 | ], 162 | "output": "pd.DataFrame({'one': {0: 4, 1: 10}, 'two': {0: 5, 1: 11}, 'three': {0: 6, 1: 12}})", 163 | "invars": [ 164 | "df1", 165 | "df2" 166 | ], 167 | "outvar": "df2" 168 | } 169 | ], 170 | "solutions": [ 171 | [ 172 | "df2 = df1.drop(labels=[0], errors='ignore')", 173 | "df2 = df1.drop(df2.index)", 174 | "df2 = df1.drop(labels=[0])" 175 | ] 176 | ] 177 | } 178 | } 179 | }, 180 | "5": { 181 | "sets": { 182 | "A": { 183 | "queries": [ 184 | { 185 | "query": "Drop rows from dataframe df1 where value of EPS is equal to 89", 186 | "user": "u0" 187 | } 188 | ], 189 | "ios": [ 190 | { 191 | "inputs": [ 192 | "pd.DataFrame({'STK_ID': {0: 601166.0, 1: 600036.0, 2: 600016.0, 3: 601009.0, 4: 601939.0, 5: 1.0}, 'RPT_Date': {0: 20111231.0, 1: 20111231.0, 2: 20111231.0, 3: 20111231.0, 4: 20111231.0, 5: 20111231.0}, 'STK_ID1': {0: 601166.0, 1: 600036.0, 2: 600016.0, 3: 601009.0, 4: 601939.0, 5: 1.0}, 'EPS': {0: 89.0, 1: 89.0, 2: 4.3, 3: 89.0, 4: 2.5, 5: 89.0}, 
'cash': {0: 0.0, 1: 12.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0}})" 193 | ], 194 | "output": "pd.DataFrame({'STK_ID': {2: 600016.0, 4: 601939.0}, 'RPT_Date': {2: 20111231.0, 4: 20111231.0}, 'STK_ID1': {2: 600016.0, 4: 601939.0}, 'EPS': {2: 4.3, 4: 2.5}, 'cash': {2: 0.0, 4: 0.0}})", 195 | "invars": [ 196 | "df1" 197 | ], 198 | "outvar": "dfout" 199 | }, 200 | { 201 | "inputs": [ 202 | "pd.DataFrame({'a': {0: 1.0, 1: 3.0, 2: 89.0, 3: -1.0, 4: -2.0}, 'EPS': {0: 2.0, 1: 89.0, 2: 5.0, 3: -1.0, 4: -2.0}})" 203 | ], 204 | "output": "pd.DataFrame({'a': {0: 1.0, 1: 89.0, 2: -1.0, 3: -2.0}, 'EPS': {0: 2.0, 1: 5.0, 2: -1.0, 3: -2.0}})", 205 | "invars": [ 206 | "df1" 207 | ], 208 | "outvar": "dfout" 209 | } 210 | ], 211 | "solutions": [ 212 | [ 213 | "dfout = df1[(df1['EPS'] != 89)]" 214 | ] 215 | ] 216 | } 217 | } 218 | }, 219 | "6": { 220 | "sets": { 221 | "A": { 222 | "queries": [ 223 | { 224 | "query": "Apply min-max normalization on df1 and assign the result to dfout", 225 | "user": "u0" 226 | } 227 | ], 228 | "ios": [ 229 | { 230 | "inputs": [ 231 | "pd.DataFrame({'C1': {0: 10, 1: 10, 2: 20, 3: 20}, 'C2': {0: 15, 1: 15, 2: 20, 3: 20}})" 232 | ], 233 | "output": "pd.DataFrame({'C1': {0: 0.0, 1: 0.0, 2: 1.0, 3: 1.0}, 'C2': {0: 0.0, 1: 0.0, 2: 1.0, 3: 1.0}})", 234 | "invars": [ 235 | "df1" 236 | ], 237 | "outvar": "dfout" 238 | } 239 | ], 240 | "solutions": [ 241 | [ 242 | "dfout = df1.apply((lambda x: ((x - min(x)) / (max(x) - min(x)))))" 243 | ] 244 | ] 245 | } 246 | } 247 | }, 248 | "7": { 249 | "sets": { 250 | "A": { 251 | "queries": [ 252 | { 253 | "query": "Apply mean normalization on the columns of df1 and store the result in dfout", 254 | "user": "u0" 255 | } 256 | ], 257 | "ios": [ 258 | { 259 | "inputs": [ 260 | "pd.DataFrame({'C1': {0: 10, 1: 20}, 'C2': {0: 15, 1: 25}})" 261 | ], 262 | "output": "pd.DataFrame({'C1': {0: -0.707107, 1: 0.707107}, 'C2': {0: -0.707107, 1: 0.707107}})", 263 | "invars": [ 264 | "df1" 265 | ], 266 | "outvar": "dfout" 267 | } 268 | ], 269 | 
"solutions": [ 270 | [ 271 | "dfout = ((df1 - df1.mean()) / df1.std())" 272 | ] 273 | ] 274 | } 275 | } 276 | }, 277 | "8": { 278 | "sets": { 279 | "A": { 280 | "queries": [ 281 | { 282 | "query": "Select the third value in the STID column of mydata and store it in val", 283 | "user": "u0" 284 | } 285 | ], 286 | "ios": [ 287 | { 288 | "inputs": [ 289 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 290 | ], 291 | "output": "3.0", 292 | "invars": [ 293 | "mydata" 294 | ], 295 | "outvar": "val" 296 | } 297 | ], 298 | "solutions": [ 299 | [ 300 | "val = mydata['STID'][2]", 301 | "val = mydata.iloc[(2, 0)]" 302 | ] 303 | ] 304 | } 305 | } 306 | }, 307 | "9": { 308 | "sets": { 309 | "A": { 310 | "queries": [ 311 | { 312 | "query": "Create a new column 'C' in df1 as the sum of the columns 'A' and 'B'", 313 | "user": "u0" 314 | } 315 | ], 316 | "ios": [ 317 | { 318 | "inputs": [ 319 | "pd.DataFrame({'A': {0: 5.0, 1: 0.0, 2: 2.0, 3: 1.0, 4: 9.0, 5: -5.0, 6: 3.0}, 'B': {0: 2.0, 1: 4.0, 2: 6.0, 3: 1.0, 4: 1.0, 5: 7.0, 6: 3.0}})" 320 | ], 321 | "output": "pd.DataFrame({'A': {0: 5.0, 1: 0.0, 2: 2.0, 3: 1.0, 4: 9.0, 5: -5.0, 6: 3.0}, 'B': {0: 2.0, 1: 4.0, 2: 6.0, 3: 1.0, 4: 1.0, 5: 7.0, 6: 3.0}, 'C': {0: 7.0, 1: 4.0, 2: 8.0, 3: 2.0, 4: 10.0, 5: 2.0, 6: 6.0}})", 322 | "invars": [ 323 | "df1" 324 | ], 325 | "outvar": "df1" 326 | } 327 | ], 328 | "solutions": [ 329 | [ 330 | "df1['C'] = (df1['A'] + df1['B'])" 331 | ] 332 | ] 333 | } 334 | } 335 | }, 336 | "10": { 337 | "sets": { 338 | "A": { 339 | "queries": [ 340 | { 341 | "query": "Select the first three rows of the second column of the mydata and store it in out", 342 | "user": "u0" 343 | } 344 | ], 345 | "ios": [ 346 | { 347 | "inputs": [ 348 | "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1, 4: 9, 5: -5, 6: 3}, 'B': {0: 2, 1: 4, 2: 6, 3: 1, 4: 1, 5: 7, 6: 3}})" 349 | ], 350 | "output": "_s = pd.Series([2, 4, 6])\n_s.name = \"B\"\n_s", 351 | "invars": [ 352 | "mydata" 
353 | ], 354 | "outvar": "out" 355 | }, 356 | { 357 | "inputs": [ 358 | "pd.DataFrame({'a': {0: 1.0, 1: 3.0, 2: 89.0, 3: -1.0, 4: -2.0}, 'EPS': {0: 2.0, 1: 89.0, 2: 5.0, 3: -1.0, 4: -2.0}})" 359 | ], 360 | "output": "pd.DataFrame({'EPS': {0: 2.0, 1: 89.0, 2: 5.0}})", 361 | "invars": [ 362 | "mydata" 363 | ], 364 | "outvar": "out" 365 | } 366 | ], 367 | "solutions": [ 368 | [ 369 | "out = mydata.iloc[:3, 1]" 370 | ] 371 | ] 372 | } 373 | } 374 | }, 375 | "11": { 376 | "sets": { 377 | "A": { 378 | "queries": [ 379 | { 380 | "query": "Compute the fraction of non-zeros in the score column of the dataframe df1", 381 | "user": "u0" 382 | } 383 | ], 384 | "ios": [ 385 | { 386 | "inputs": [ 387 | "pd.DataFrame({'score': {0: 1.0, 1: 2.5, 2: 3.0, 3: 0.0, 4: 0.0, 5: 0.1}, 'C2': {0: 'E2', 1: 'E4', 2: 'E1', 3: 'E5', 4: 'E6', 5: 'E7'}})" 388 | ], 389 | "output": "0.6666666666666666", 390 | "invars": [ 391 | "df1" 392 | ], 393 | "outvar": "dfout" 394 | } 395 | ], 396 | "solutions": [ 397 | [ 398 | "dfout = sum(df1.score != 0)/len(df1.score)" 399 | ] 400 | ] 401 | } 402 | } 403 | }, 404 | "12": { 405 | "sets": { 406 | "A": { 407 | "queries": [ 408 | { 409 | "query": "List the unique values of 'C2'", 410 | "user": "u0" 411 | } 412 | ], 413 | "ios": [ 414 | { 415 | "inputs": [ 416 | "pd.DataFrame({'score': {0: 1.0, 1: 2.5, 2: 44.0, 3: 3.0, 4: 0.0, 5: 1.3, 6: 7.0}, 'C2': {0: 'E2', 1: 'E4', 2: 'E1', 3: 'E1', 4: 'E5', 5: 'E6', 6: 'E2'}})" 417 | ], 418 | "output": "np.array(['E2', 'E4', 'E1', 'E5', 'E6'])", 419 | "invars": [ 420 | "df1" 421 | ], 422 | "outvar": "dfout" 423 | } 424 | ], 425 | "solutions": [ 426 | [ 427 | "dfout = df1.C2.unique()" 428 | ] 429 | ] 430 | } 431 | } 432 | }, 433 | "13": { 434 | "sets": { 435 | "A": { 436 | "queries": [ 437 | { 438 | "query": "For each quarter find the subsidiary with top earnings value", 439 | "user": "u0" 440 | } 441 | ], 442 | "ios": [ 443 | { 444 | "inputs": [ 445 | "pd.DataFrame({'Quarter': {1: 'Q1', 2: 'Q1', 3: 'Q2', 4: 'Q2'}, 
'Subsidiary': {1: 'US', 2: 'Kenya', 3: 'US', 4: 'India'}, 'Earnings': {1: 600, 2: 200, 3: 150, 4: 200}})" 446 | ], 447 | "output": "pd.DataFrame({'Quarter': {1: 'Q1', 4: 'Q2'}, 'Subsidiary': {1: 'US', 4: 'India'}, 'Earnings': {1: 600, 4: 200}})", 448 | "invars": [ 449 | "df1" 450 | ], 451 | "outvar": "dfout" 452 | } 453 | ], 454 | "solutions": [ 455 | [ 456 | "dfout = df1.loc[df1.groupby('Quarter')['Earnings'].idxmax()]" 457 | ] 458 | ] 459 | } 460 | } 461 | }, 462 | "14": { 463 | "sets": { 464 | "A": { 465 | "queries": [ 466 | { 467 | "query": "Set the column 'industry' of dataframe df1 to a value 5", 468 | "user": "u0" 469 | } 470 | ], 471 | "ios": [ 472 | { 473 | "inputs": [ 474 | "pd.DataFrame({'issueid': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, 'industry': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}})" 475 | ], 476 | "output": "pd.DataFrame({'issueid': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, 'industry': {0: 5, 1: 5, 2: 5, 3: 5, 4: 5}})", 477 | "invars": [ 478 | "df1" 479 | ], 480 | "outvar": "df1" 481 | } 482 | ], 483 | "solutions": [ 484 | [ 485 | "df1['industry'] = 5" 486 | ] 487 | ] 488 | } 489 | } 490 | }, 491 | "15": { 492 | "sets": { 493 | "A": { 494 | "queries": [ 495 | { 496 | "query": "Sort the rows by column 'B' in descending order in df1 and assign it to dfout", 497 | "user": "u0" 498 | } 499 | ], 500 | "ios": [ 501 | { 502 | "inputs": [ 503 | "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1, 4: 9, 5: -5, 6: 3}, 'B': {0: 2, 1: 4, 2: 6, 3: 1, 4: 1, 5: 7, 6: 3}})" 504 | ], 505 | "output": "pd.DataFrame({'A': {5: -5, 2: 2, 1: 0, 6: 3, 0: 5, 3: 1, 4: 9}, 'B': {5: 7, 2: 6, 1: 4, 6: 3, 0: 2, 3: 1, 4: 1}})", 506 | "invars": [ 507 | "df1" 508 | ], 509 | "outvar": "dfout" 510 | } 511 | ], 512 | "solutions": [ 513 | [ 514 | "dfout = df1.sort_values(by='B', ascending=False).copy()" 515 | ] 516 | ] 517 | } 518 | } 519 | }, 520 | "16": { 521 | "sets": { 522 | "A": { 523 | "queries": [ 524 | { 525 | "query": "Rename the 'key' column of the dataframe df1 to 'KEY' and return it to dfout", 526 | 
"user": "u0" 527 | } 528 | ], 529 | "ios": [ 530 | { 531 | "inputs": [ 532 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 533 | ], 534 | "output": "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'KEY': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})", 535 | "invars": [ 536 | "df1" 537 | ], 538 | "outvar": "dfout" 539 | } 540 | ], 541 | "solutions": [ 542 | [ 543 | "dfout = df1.rename(columns={\n 'key': 'KEY',\n})" 544 | ] 545 | ] 546 | } 547 | } 548 | }, 549 | "17": { 550 | "sets": { 551 | "A": { 552 | "queries": [ 553 | { 554 | "query": "put first two row of df1 in dfout", 555 | "user": "u0" 556 | } 557 | ], 558 | "ios": [ 559 | { 560 | "inputs": [ 561 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 562 | ], 563 | "output": "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0}, 'key': {'a': 1, 'b': 3}})", 564 | "invars": [ 565 | "df1" 566 | ], 567 | "outvar": "dfout" 568 | } 569 | ], 570 | "solutions": [ 571 | [ 572 | "dfout = df1[:2]" 573 | ] 574 | ] 575 | } 576 | } 577 | }, 578 | "18": { 579 | "sets": { 580 | "A": { 581 | "queries": [ 582 | { 583 | "query": "put third row of df1 in dfout", 584 | "user": "u0" 585 | } 586 | ], 587 | "ios": [ 588 | { 589 | "inputs": [ 590 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 591 | ], 592 | "output": "pd.DataFrame({'STID': {'c': 3.0}, 'key': {'c': 4}})", 593 | "invars": [ 594 | "df1" 595 | ], 596 | "outvar": "dfout" 597 | } 598 | ], 599 | "solutions": [ 600 | [ 601 | "dfout = df1[2:3]" 602 | ] 603 | ] 604 | } 605 | } 606 | }, 607 | "19": { 608 | "sets": { 609 | "A": { 610 | "queries": [ 611 | { 612 | "query": "put first and third row of df1 in dfout", 613 | "user": "u0" 614 | } 615 | ], 616 | "ios": [ 617 | { 618 | "inputs": [ 619 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 
620 | ], 621 | "output": "pd.DataFrame({'STID': {'a': 1.0, 'c': 3.0}, 'key': {'a': 1, 'c': 4}})", 622 | "invars": [ 623 | "df1" 624 | ], 625 | "outvar": "dfout" 626 | }, 627 | { 628 | "inputs": [ 629 | "pd.DataFrame({'a': {0: 1.0, 1: 3.0, 2: 89.0, 3: -1.0, 4: -2.0}, 'EPS': {0: 2.0, 1: 89.0, 2: 5.0, 3: -1.0, 4: -2.0}})" 630 | ], 631 | "output": "pd.DataFrame({'a': {0: 1.0, 1: 89.0}, 'EPS': {0: 2.0, 1: 5.0}})", 632 | "invars": [ 633 | "df1" 634 | ], 635 | "outvar": "dfout" 636 | } 637 | ], 638 | "solutions": [ 639 | [ 640 | "dfout = df1.loc[['a','c']]" 641 | ] 642 | ] 643 | } 644 | } 645 | }, 646 | "20": { 647 | "sets": { 648 | "A": { 649 | "queries": [ 650 | { 651 | "query": "Remove last three rows from df1 and assign it to dfout", 652 | "user": "u0" 653 | } 654 | ], 655 | "ios": [ 656 | { 657 | "inputs": [ 658 | "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1, 4: 9, 5: -5, 6: 3}, 'B': {0: 2, 1: 4, 2: 6, 3: 1, 4: 1, 5: 7, 6: 3}})" 659 | ], 660 | "output": "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1}, 'B': {0: 2, 1: 4, 2: 6, 3: 1}})", 661 | "invars": [ 662 | "df1" 663 | ], 664 | "outvar": "dfout" 665 | }, 666 | { 667 | "inputs": [ 668 | "pd.DataFrame({'a': {0: 1.0, 1: 1.0, 2: 3.0, 3: 3.0, 4: 89.0, 5: -1.0, 6: -2.0, 7: -1.0, 8: -2.0}, 'EPS': {0: 1.0, 1: 2.0, 2: 89.0, 3: 90.0, 4: 5.0, 5: -1.0, 6: -2.0, 7: -1.0, 8: -2.0}})" 669 | ], 670 | "output": "pd.DataFrame({'a': {0: 1.0, 1: 1.0, 2: 3.0, 3: 3.0, 4: 89.0, 5: -1.0}, 'EPS': {0: 1.0, 1: 2.0, 2: 89.0, 3: 90.0, 4: 5.0, 5: -1.0}})", 671 | "invars": [ 672 | "df1" 673 | ], 674 | "outvar": "dfout" 675 | } 676 | ], 677 | "solutions": [ 678 | [ 679 | "dfout = df1.drop(labels=[4, 5, 6], errors='ignore')", 680 | "dfout = df1.head(len(df1)-3)", 681 | "dfout = df1.drop(labels=[4, 5, 6])" 682 | ] 683 | ] 684 | } 685 | } 686 | }, 687 | "21": { 688 | "sets": { 689 | "A": { 690 | "queries": [ 691 | { 692 | "query": "take last three rows in df1 and assign it to dfout", 693 | "user": "u0" 694 | } 695 | ], 696 | "ios": [ 697 | { 698 | 
"inputs": [ 699 | "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1, 4: 9, 5: -5, 6: 3}, 'B': {0: 2, 1: 4, 2: 6, 3: 1, 4: 1, 5: 7, 6: 3}})" 700 | ], 701 | "output": "pd.DataFrame({'A': {4: 9, 5: -5, 6: 3}, 'B': {4: 1, 5: 7, 6: 3}})", 702 | "invars": [ 703 | "df1" 704 | ], 705 | "outvar": "dfout" 706 | } 707 | ], 708 | "solutions": [ 709 | [ 710 | "dfout = df1[(- 3):]" 711 | ] 712 | ] 713 | } 714 | } 715 | }, 716 | "22": { 717 | "sets": { 718 | "A": { 719 | "queries": [ 720 | { 721 | "query": "count the number of null values in df1 and return it to dfout", 722 | "user": "u0" 723 | } 724 | ], 725 | "ios": [ 726 | { 727 | "inputs": [ 728 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 729 | ], 730 | "output": "1", 731 | "invars": [ 732 | "df1" 733 | ], 734 | "outvar": "dfout" 735 | } 736 | ], 737 | "solutions": [ 738 | [ 739 | "dfout = df1.isnull().sum().sum()" 740 | ] 741 | ] 742 | } 743 | } 744 | }, 745 | "23": { 746 | "sets": { 747 | "A": { 748 | "queries": [ 749 | { 750 | "query": "drop rows with null values in df1 and return it to dfout", 751 | "user": "u0" 752 | } 753 | ], 754 | "ios": [ 755 | { 756 | "inputs": [ 757 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 758 | ], 759 | "output": "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0}, 'key': {'a': 1, 'b': 3, 'c': 4}})", 760 | "invars": [ 761 | "df1" 762 | ], 763 | "outvar": "dfout" 764 | } 765 | ], 766 | "solutions": [ 767 | [ 768 | "dfout = df1.dropna()" 769 | ] 770 | ] 771 | } 772 | } 773 | }, 774 | "24": { 775 | "sets": { 776 | "A": { 777 | "queries": [ 778 | { 779 | "query": "replace null values in df1 with 4.0 and return it to dfout", 780 | "user": "u0" 781 | } 782 | ], 783 | "ios": [ 784 | { 785 | "inputs": [ 786 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 787 | ], 788 | "output": "pd.DataFrame({'STID': {'a': 1.0, 
'b': 2.0, 'c': 3.0, 'd': 4.0}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})", 789 | "invars": [ 790 | "df1" 791 | ], 792 | "outvar": "dfout" 793 | } 794 | ], 795 | "solutions": [ 796 | [ 797 | "dfout = df1.fillna(4.0)", 798 | "dfout = df1.replace(np.nan, 4.0)" 799 | ] 800 | ] 801 | } 802 | } 803 | }, 804 | "25": { 805 | "sets": { 806 | "A": { 807 | "queries": [ 808 | { 809 | "query": "replace null values in df1 with mean of the column and return it to dfout", 810 | "user": "u0" 811 | } 812 | ], 813 | "ios": [ 814 | { 815 | "inputs": [ 816 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1.0, 'b': None, 'c': 4.0, 'd': 7.0}})" 817 | ], 818 | "output": "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': 2.0}, 'key': {'a': 1.0, 'b': 4.0, 'c': 4.0, 'd': 7.0}})", 819 | "invars": [ 820 | "df1" 821 | ], 822 | "outvar": "dfout" 823 | } 824 | ], 825 | "solutions": [ 826 | [ 827 | "dfout = df1.replace(np.nan, df1.mean())", 828 | "dfout = df1.fillna(df1.mean())" 829 | ] 830 | ] 831 | } 832 | } 833 | }, 834 | "26": { 835 | "sets": { 836 | "A": { 837 | "queries": [ 838 | { 839 | "query": "find mean and median of columns in df1 and save it to dfout", 840 | "user": "u0" 841 | } 842 | ], 843 | "ios": [ 844 | { 845 | "inputs": [ 846 | "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1, 4: 9, 5: -5, 6: 3}, 'B': {0: 2, 1: 4, 2: 6, 3: 1, 4: 1, 5: 7, 6: 3}})" 847 | ], 848 | "output": "pd.DataFrame({'A': {0: 2.142857142857143, 1: 2.0}, 'B': {0: 3.4285714285714284, 1: 3.0}})", 849 | "invars": [ 850 | "df1" 851 | ], 852 | "outvar": "dfout" 853 | } 854 | ], 855 | "solutions": [ 856 | [ 857 | "dfout = pd.DataFrame([df1.mean(), df1.median()])" 858 | ] 859 | ] 860 | } 861 | } 862 | }, 863 | "27": { 864 | "sets": { 865 | "A": { 866 | "queries": [ 867 | { 868 | "query": "Filter rows where there is more than one NaN.", 869 | "user": "u0" 870 | } 871 | ], 872 | "ios": [ 873 | { 874 | "inputs": [ 875 | "pd.DataFrame({'Score1': {0: 100.0, 1: 90.0, 2: None, 3: 95.0}, 
'Score2': {0: 30.0, 1: None, 2: 45.0, 3: 56.0}, 'Score3': {0: 52, 1: 40, 2: 80, 3: 98}, 'Score4': {0: None, 1: None, 2: None, 3: 65.0}})" 876 | ], 877 | "output": "pd.DataFrame({'Score1': {1: 90.0, 2: None}, 'Score2': {1: None, 2: 45.0}, 'Score3': {1: 40, 2: 80}, 'Score4': {1: None, 2: None}})", 878 | "invars": [ 879 | "df1" 880 | ], 881 | "outvar": "dfout" 882 | }, 883 | { 884 | "inputs": [ 885 | "pd.DataFrame({'a': {0: None, 1: 3.0, 2: 89.0, 3: None, 4: -2.0, 5: -1.0, 6: None}, 'EPS': {0: 2.0, 1: 89.0, 2: 5.0, 3: None, 4: -2.0, 5: -1.0, 6: None}})" 886 | ], 887 | "output": "pd.DataFrame({'a': {0: None, 1: None}, 'EPS': {0: None, 1: None}})", 888 | "invars": [ 889 | "df1" 890 | ], 891 | "outvar": "dfout" 892 | } 893 | ], 894 | "solutions": [ 895 | [ 896 | "dfout = df1.loc[(df1.isnull().sum(axis=1) > 1), :]" 897 | ] 898 | ] 899 | } 900 | } 901 | }, 902 | "28": { 903 | "sets": { 904 | "A": { 905 | "queries": [ 906 | { 907 | "query": "Set 'A' as 5 in 6th row of df1", 908 | "user": "u0" 909 | } 910 | ], 911 | "ios": [ 912 | { 913 | "inputs": [ 914 | "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1, 4: 9, 5: -5, 6: 3}, 'B': {0: 2, 1: 4, 2: 6, 3: 1, 4: 1, 5: 7, 6: 3}})" 915 | ], 916 | "output": "pd.DataFrame({'A': {0: 5, 1: 0, 2: 2, 3: 1, 4: 9, 5: 5, 6: 3}, 'B': {0: 2, 1: 4, 2: 6, 3: 1, 4: 1, 5: 7, 6: 3}})", 917 | "invars": [ 918 | "df1" 919 | ], 920 | "outvar": "dfout" 921 | } 922 | ], 923 | "solutions": [ 924 | "df1.loc[5, 'A'] = 5" 925 | ] 926 | } 927 | } 928 | }, 929 | "29": { 930 | "sets": { 931 | "A": { 932 | "queries": [ 933 | { 934 | "query": "Remove rows with null values from df1 and return it to dfout", 935 | "user": "u0" 936 | } 937 | ], 938 | "ios": [ 939 | { 940 | "inputs": [ 941 | "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0, 'd': None}, 'key': {'a': 1, 'b': 3, 'c': 4, 'd': 7}})" 942 | ], 943 | "output": "pd.DataFrame({'STID': {'a': 1.0, 'b': 2.0, 'c': 3.0}, 'key': {'a': 1, 'b': 3, 'c': 4}})", 944 | "invars": [ 945 | "df1" 946 | ], 947 | "outvar": 
"dfout" 948 | } 949 | ], 950 | "solutions": [ 951 | [ 952 | "dfout = df1.dropna()" 953 | ] 954 | ] 955 | } 956 | } 957 | }, 958 | "30": { 959 | "sets": { 960 | "A": { 961 | "queries": [ 962 | { 963 | "query": "Concatenate two data frames and drop duplicates", 964 | "user": "u0" 965 | } 966 | ], 967 | "ios": [ 968 | { 969 | "inputs": [ 970 | "pd.DataFrame({'A': {0: 1, 1: 3}, 'B': {0: 2, 1: 1}})", 971 | "pd.DataFrame({'A': {0: 5, 1: 3}, 'B': {0: 6, 1: 1}})" 972 | ], 973 | "output": "pd.DataFrame({'A': {0: 1, 1: 3, 2: 5}, 'B': {0: 2, 1: 1, 2: 6}})", 974 | "invars": [ 975 | "df1", 976 | "df2" 977 | ], 978 | "outvar": "dfout" 979 | } 980 | ], 981 | "solutions": [ 982 | [ 983 | "dfout = pd.concat([df1, df2]).drop_duplicates().reset_index(drop=True)" 984 | ] 985 | ] 986 | } 987 | } 988 | }, 989 | "31": { 990 | "sets": { 991 | "A": { 992 | "queries": [ 993 | { 994 | "query": "Drop duplicates from dataframe and reindex.", 995 | "user": "u0" 996 | } 997 | ], 998 | "ios": [ 999 | { 1000 | "inputs": [ 1001 | "pd.DataFrame({'A': {0: 1, 1: 3, 2: 5, 3: 3}, 'B': {0: 2, 1: 1, 2: 6, 3: 1}})" 1002 | ], 1003 | "output": "pd.DataFrame({'A': {0: 1, 1: 3, 2: 5}, 'B': {0: 2, 1: 1, 2: 6}})", 1004 | "invars": [ 1005 | "df1" 1006 | ], 1007 | "outvar": "dfout" 1008 | } 1009 | ], 1010 | "solutions": [ 1011 | [ 1012 | "dfout = df1.drop_duplicates(keep='first')", 1013 | "dfout = df1.drop_duplicates(keep='first').reindex(columns=['A', 'B'])" 1014 | ] 1015 | ] 1016 | } 1017 | } 1018 | }, 1019 | "32": { 1020 | "sets": { 1021 | "A": { 1022 | "queries": [ 1023 | { 1024 | "query": "Transpose the dataframe", 1025 | "user": "u0" 1026 | } 1027 | ], 1028 | "ios": [ 1029 | { 1030 | "inputs": [ 1031 | "pd.DataFrame({'A': {'a': 1, 'b': 2, 'c': 3}, 'B': {'a': 4, 'b': 5, 'c': 6}, 'C': {'a': 7, 'b': 8, 'c': 9}})" 1032 | ], 1033 | "output": "pd.DataFrame({'a': {'A': 1, 'B': 4, 'C': 7}, 'b': {'A': 2, 'B': 5, 'C': 8}, 'c': {'A': 3, 'B': 6, 'C': 9}})", 1034 | "invars": [ 1035 | "df1" 1036 | ], 1037 | "outvar": 
"dfout" 1038 | } 1039 | ], 1040 | "solutions": [ 1041 | [ 1042 | "dfout = df1.T" 1043 | ] 1044 | ] 1045 | } 1046 | } 1047 | }, 1048 | "33": { 1049 | "sets": { 1050 | "A": { 1051 | "queries": [ 1052 | { 1053 | "query": "Select rows where location is 'a'", 1054 | "user": "u0" 1055 | } 1056 | ], 1057 | "ios": [ 1058 | { 1059 | "inputs": [ 1060 | "pd.DataFrame({'date': {0: 20130101, 1: 20130101, 2: 20130102}, 'location': {0: 'a', 1: 'a', 2: 'c'}})" 1061 | ], 1062 | "output": "pd.DataFrame({'date': {0: 20130101, 1: 20130101}, 'location': {0: 'a', 1: 'a'}})", 1063 | "invars": [ 1064 | "df1" 1065 | ], 1066 | "outvar": "dfout" 1067 | } 1068 | ], 1069 | "solutions": [ 1070 | [ 1071 | "dfout = df1[(df1['location'] == 'a')]" 1072 | ] 1073 | ] 1074 | } 1075 | } 1076 | }, 1077 | "34": { 1078 | "sets": { 1079 | "A": { 1080 | "queries": [ 1081 | { 1082 | "query": "Filter the columns from dataframe where type equal to 'float64'", 1083 | "user": "u0" 1084 | } 1085 | ], 1086 | "ios": [ 1087 | { 1088 | "inputs": [ 1089 | "pd.DataFrame({0: {0: 1}, 1: {0: 'a'}, 2: {0: 2.0}})" 1090 | ], 1091 | "output": "pd.DataFrame({0: {0: 1}, 1: {0: 'a'}})", 1092 | "invars": [ 1093 | "df1" 1094 | ], 1095 | "outvar": "dfout" 1096 | }, 1097 | { 1098 | "inputs": [ 1099 | "pd.DataFrame({'A': {0: 1.1, 1: 1.1, 2: 1.1}, 'B': {0: 2.3, 1: 5.0, 2: 5.0}, 'C': {0: 5.7, 1: 5.7, 2: 5.7}, 'D': {0: 0, 1: 0, 2: 100}, 'E': {0: 21, 1: 21, 2: 99}, 'F': {0: 55.0, 1: 55.0, 2: 105.5}, 'G': {0: '333', 1: '444', 2: '444'}})" 1100 | ], 1101 | "output": "pd.DataFrame({'D': {0: 0, 1: 0, 2: 100}, 'E': {0: 21, 1: 21, 2: 99}, 'G': {0: '333', 1: '444', 2: '444'}})", 1102 | "invars": [ 1103 | "df1" 1104 | ], 1105 | "outvar": "dfout" 1106 | } 1107 | ], 1108 | "solutions": [ 1109 | [ 1110 | "dfout = df1.drop(labels=[2], axis=1)", 1111 | "dfout = df1.loc[:, df1.dtypes != 'float64']" 1112 | ] 1113 | ] 1114 | } 1115 | } 1116 | }, 1117 | "35": { 1118 | "sets": { 1119 | "A": { 1120 | "queries": [ 1121 | { 1122 | "query": "Check if all the 
values of column 'C1' in the dataframe contains values 1, 2 or 3.", 1123 | "user": "u0" 1124 | } 1125 | ], 1126 | "ios": [ 1127 | { 1128 | "inputs": [ 1129 | "pd.DataFrame({'C1': {0: 1, 1: 2, 2: 2, 3: 1, 4: 3}, 'C2': {0: 'E2', 1: 'E4', 2: 'E2', 3: 'E4', 4: 'E6'}})" 1130 | ], 1131 | "output": "_s = pd.Series([True, True, True, True, True])\n_s.name = \"C1\"\n_s", 1132 | "invars": [ 1133 | "df1" 1134 | ], 1135 | "outvar": "dfout" 1136 | } 1137 | ], 1138 | "solutions": [ 1139 | [ 1140 | "dfout = df1.C1.isin([1, 2, 3])" 1141 | ] 1142 | ] 1143 | } 1144 | } 1145 | }, 1146 | "36": { 1147 | "sets": { 1148 | "A": { 1149 | "queries": [ 1150 | { 1151 | "query": "Convert the 'foo' column in the dataframe to numeric ignoring the errors.", 1152 | "user": "u0" 1153 | } 1154 | ], 1155 | "ios": [ 1156 | { 1157 | "inputs": [ 1158 | "pd.DataFrame({'foo': {0: '1', 1: '2.0', 2: '-', 3: '-', 4: '3.447'}})" 1159 | ], 1160 | "output": "pd.DataFrame({'foo': {0: 1.0, 1: 2.0, 2: None, 3: None, 4: 3.447}})", 1161 | "invars": [ 1162 | "df1" 1163 | ], 1164 | "outvar": "df1" 1165 | } 1166 | ], 1167 | "solutions": [ 1168 | [ 1169 | "df1['foo'] = pd.to_numeric(df1['foo'], errors='coerce')" 1170 | ] 1171 | ] 1172 | } 1173 | } 1174 | }, 1175 | "37": { 1176 | "sets": { 1177 | "A": { 1178 | "queries": [ 1179 | { 1180 | "query": "Convert the 'foo' column in the dataframe to numeric", 1181 | "user": "u0" 1182 | } 1183 | ], 1184 | "ios": [ 1185 | { 1186 | "inputs": [ 1187 | "pd.DataFrame({'foo': {0: '1', 1: '2.0', 2: '-', 3: '-', 4: '3.447'}})" 1188 | ], 1189 | "output": "pd.DataFrame({'foo': {0: 1.0, 1: 2.0, 2: None, 3: None, 4: 3.447}})", 1190 | "invars": [ 1191 | "df1" 1192 | ], 1193 | "outvar": "df1" 1194 | } 1195 | ], 1196 | "solutions": [ 1197 | [ 1198 | "df1['foo'] = pd.to_numeric(df1['foo'], errors='coerce')" 1199 | ] 1200 | ] 1201 | } 1202 | } 1203 | }, 1204 | "38": { 1205 | "sets": { 1206 | "A": { 1207 | "queries": [ 1208 | { 1209 | "query": "Move column 'deaths' to be the first column of
df", 1210 | "user": "u0" 1211 | } 1212 | ], 1213 | "ios": [ 1214 | { 1215 | "inputs": [ 1216 | "pd.DataFrame({'name': {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'}, 'val': {0: 12, 1: 11, 2: 17, 3: 14, 4: 5}, 'deaths': {0: 45, 1: 92, 2: 22, 3: 39, 4: 79}, 'ix': {0: 2, 1: 3, 2: 2, 3: 2, 4: 4}})" 1217 | ], 1218 | "output": "pd.DataFrame({'deaths': {0: 45, 1: 92, 2: 22, 3: 39, 4: 79}, 'name': {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'}, 'val': {0: 12, 1: 11, 2: 17, 3: 14, 4: 5}, 'ix': {0: 2, 1: 3, 2: 2, 3: 2, 4: 4}})", 1219 | "invars": [ 1220 | "df", 1221 | "x" 1222 | ], 1223 | "outvar": "df" 1224 | }, 1225 | { 1226 | "inputs": [ 1227 | "pd.DataFrame({'one': {0: 1, 1: 4, 2: 7, 3: 10}, 'two': {0: 2, 1: 5, 2: 8, 3: 11}, 'deaths': {0: 3, 1: 6, 2: 9, 3: 12}})" 1228 | ], 1229 | "output": "pd.DataFrame({'deaths': {0: 3, 1: 6, 2: 9, 3: 12}, 'one': {0: 1, 1: 4, 2: 7, 3: 10}, 'two': {0: 2, 1: 5, 2: 8, 3: 11}})", 1230 | "invars": [ 1231 | "df", 1232 | "x" 1233 | ], 1234 | "outvar": "df" 1235 | } 1236 | ], 1237 | "solutions": [ 1238 | "df = df[['deaths'] + [x for x in df.columns if x!='deaths']]\n" 1239 | ] 1240 | } 1241 | } 1242 | }, 1243 | "39": { 1244 | "sets": { 1245 | "A": { 1246 | "queries": [ 1247 | { 1248 | "query": "Convert column 'time' of df to pandas datetime", 1249 | "user": "u0" 1250 | } 1251 | ], 1252 | "ios": [ 1253 | { 1254 | "inputs": [ 1255 | "pd.DataFrame({'time': {0: '04:12:40', 1: '04:17:01', 2: '04:17:01', 3: '05:12:56', 4: '05:33:12', 5: '05:38:33', 6: '05:48:52', 7: '05:50:22', 8: '04:17:01'}, 'val': {0: 30, 1: 22, 2: 23, 3: 4, 4: 5, 5: 18, 6: 13, 7: 13, 8: 18}})" 1256 | ], 1257 | "output": "pd.DataFrame({'time': {0: pd.Timestamp('2021-09-03 04:12:40'), 1: pd.Timestamp('2021-09-03 04:17:01'), 2: pd.Timestamp('2021-09-03 04:17:01'), 3: pd.Timestamp('2021-09-03 05:12:56'), 4: pd.Timestamp('2021-09-03 05:33:12'), 5: pd.Timestamp('2021-09-03 05:38:33'), 6: pd.Timestamp('2021-09-03 05:48:52'), 7: pd.Timestamp('2021-09-03 05:50:22'), 8: pd.Timestamp('2021-09-03 
04:17:01')}, 'val': {0: 30, 1: 22, 2: 23, 3: 4, 4: 5, 5: 18, 6: 13, 7: 13, 8: 18}})", 1258 | "invars": [ 1259 | "df" 1260 | ], 1261 | "outvar": "df" 1262 | } 1263 | ], 1264 | "solutions": [ 1265 | "df['time'] = pd.to_datetime(df['time'])\n" 1266 | ] 1267 | } 1268 | } 1269 | }, 1270 | "40": { 1271 | "sets": { 1272 | "A": { 1273 | "queries": [ 1274 | { 1275 | "query": "remove rows with duplicates occuring consecutively in column 'time' of dataframe df", 1276 | "user": "u0" 1277 | } 1278 | ], 1279 | "ios": [ 1280 | { 1281 | "inputs": [ 1282 | "pd.DataFrame({'time': {0: pd.Timestamp('2021-09-03 04:12:40'), 1: pd.Timestamp('2021-09-03 04:17:01'), 2: pd.Timestamp('2021-09-03 04:17:01'), 3: pd.Timestamp('2021-09-03 05:12:56'), 4: pd.Timestamp('2021-09-03 05:33:12'), 5: pd.Timestamp('2021-09-03 05:38:33'), 6: pd.Timestamp('2021-09-03 05:48:52'), 7: pd.Timestamp('2021-09-03 05:50:22'), 8: pd.Timestamp('2021-09-03 04:17:01')}, 'val': {0: 30, 1: 22, 2: 23, 3: 4, 4: 5, 5: 18, 6: 13, 7: 13, 8: 18}})" 1283 | ], 1284 | "output": "pd.DataFrame({'time': {0: pd.Timestamp('2021-09-03 04:12:40'), 1: pd.Timestamp('2021-09-03 04:17:01'), 3: pd.Timestamp('2021-09-03 05:12:56'), 4: pd.Timestamp('2021-09-03 05:33:12'), 5: pd.Timestamp('2021-09-03 05:38:33'), 6: pd.Timestamp('2021-09-03 05:48:52'), 7: pd.Timestamp('2021-09-03 05:50:22'), 8: pd.Timestamp('2021-09-03 04:17:01')}, 'val': {0: 30, 1: 22, 3: 4, 4: 5, 5: 18, 6: 13, 7: 13, 8: 18}})", 1285 | "invars": [ 1286 | "df" 1287 | ], 1288 | "outvar": "dfout" 1289 | } 1290 | ], 1291 | "solutions": [ 1292 | "dfout = df[df['time'].diff()!=pd.Timedelta(\"0\")]\n" 1293 | ] 1294 | } 1295 | } 1296 | }, 1297 | "41": { 1298 | "sets": { 1299 | "A": { 1300 | "queries": [ 1301 | { 1302 | "query": "drop rows logged at less than 10 minute 'time' interval in column df", 1303 | "user": "u0" 1304 | } 1305 | ], 1306 | "ios": [ 1307 | { 1308 | "inputs": [ 1309 | "pd.DataFrame({'time': {0: pd.Timestamp('2021-09-03 04:12:40'), 1: pd.Timestamp('2021-09-03 
04:17:01'), 2: None, 3: pd.Timestamp('2021-09-03 05:12:56'), 4: pd.Timestamp('2021-09-03 05:33:12'), 5: pd.Timestamp('2021-09-03 05:38:33'), 6: pd.Timestamp('2021-09-03 05:48:52'), 7: pd.Timestamp('2021-09-03 05:50:22'), 8: pd.Timestamp('2021-09-03 04:17:01')}, 'val': {0: 30, 1: 22, 2: 23, 3: 4, 4: 5, 5: 18, 6: 13, 7: 13, 8: 18}})" 1310 | ], 1311 | "output": "pd.DataFrame({'time': {1: pd.Timestamp('2021-09-03 04:17:01'), 5: pd.Timestamp('2021-09-03 05:38:33'), 7: pd.Timestamp('2021-09-03 05:50:22'), 8: pd.Timestamp('2021-09-03 04:17:01')}, 'val': {1: 22, 5: 18, 7: 13, 8: 18}})", 1312 | "invars": [ 1313 | "df" 1314 | ], 1315 | "outvar": "dfout" 1316 | }, 1317 | { 1318 | "inputs": [ 1319 | "pd.DataFrame({'time': {0: pd.Timestamp('2021-09-03 04:12:40'), 1: pd.Timestamp('2021-09-03 04:17:01'), 2: pd.Timestamp('2021-09-03 04:17:01'), 3: pd.Timestamp('2021-09-03 05:12:56'), 4: pd.Timestamp('2021-09-03 05:33:12'), 5: pd.Timestamp('2021-09-03 05:38:33'), 6: pd.Timestamp('2021-09-03 05:48:52'), 7: pd.Timestamp('2021-09-03 05:50:22'), 8: pd.Timestamp('2021-09-03 05:12:56'), 9: pd.Timestamp('2021-09-03 05:33:12'), 10: pd.Timestamp('2021-09-03 05:38:33'), 11: pd.Timestamp('2021-09-03 05:48:52'), 12: pd.Timestamp('2021-09-03 05:50:22'), 13: pd.Timestamp('2021-09-03 04:17:01')}, 'val': {0: 30, 1: 22, 2: 23, 3: 4, 4: 5, 5: 18, 6: 13, 7: 13, 8: 4, 9: 5, 10: 18, 11: 13, 12: 13, 13: 18}})" 1320 | ], 1321 | "output": "pd.DataFrame({'time': {1: pd.Timestamp('2021-09-03 04:17:01'), 2: pd.Timestamp('2021-09-03 04:17:01'), 5: pd.Timestamp('2021-09-03 05:38:33'), 7: pd.Timestamp('2021-09-03 05:50:22'), 8: pd.Timestamp('2021-09-03 05:12:56'), 10: pd.Timestamp('2021-09-03 05:38:33'), 12: pd.Timestamp('2021-09-03 05:50:22'), 13: pd.Timestamp('2021-09-03 04:17:01')}, 'val': {1: 22, 2: 23, 5: 18, 7: 13, 8: 4, 10: 18, 12: 13, 13: 18}})", 1322 | "invars": [ 1323 | "df" 1324 | ], 1325 | "outvar": "dfout" 1326 | } 1327 | ], 1328 | "solutions": [ 1329 | "dfout = df[df['time'].diff()3)]\n" 1786 | ] 
1787 | } 1788 | } 1789 | }, 1790 | "58": { 1791 | "sets": { 1792 | "A": { 1793 | "queries": [ 1794 | { 1795 | "query": "Update column 'present' to True in dataframe df where string 'dog' occurs inside column 'pet'", 1796 | "user": "u0" 1797 | } 1798 | ], 1799 | "ios": [ 1800 | { 1801 | "inputs": [ 1802 | "pd.DataFrame({'pet': {0: 'dog-a', 1: 'dog-b', 2: 'cat', 3: 'goldfish', 4: 'c-dog', 5: 'hamster'}, 'present': {0: False, 1: False, 2: False, 3: False, 4: False, 5: False}})" 1803 | ], 1804 | "output": "pd.DataFrame({'pet': {0: 'dog-a', 1: 'dog-b', 2: 'cat', 3: 'goldfish', 4: 'c-dog', 5: 'hamster'}, 'present': {0: True, 1: True, 2: False, 3: False, 4: True, 5: False}})", 1805 | "invars": [ 1806 | "df" 1807 | ], 1808 | "outvar": "df" 1809 | } 1810 | ], 1811 | "solutions": [ 1812 | "df.loc[df.pet.str.contains('dog'),'present'] = True\n" 1813 | ] 1814 | } 1815 | } 1816 | }, 1817 | "59": { 1818 | "sets": { 1819 | "A": { 1820 | "queries": [ 1821 | { 1822 | "query": "Increase 'count' column by 1 when column 'pet' contains substring 'dog' or 'cat'", 1823 | "user": "u0" 1824 | } 1825 | ], 1826 | "ios": [ 1827 | { 1828 | "inputs": [ 1829 | "pd.DataFrame({'pet': {0: 'dog-a', 1: 'dog-b', 2: 'cat', 3: 'goldfish', 4: 'c-dog', 5: 'hamster'}, 'count': {0: 12, 1: 15, 2: 4, 3: 4, 4: 15, 5: 2}})" 1830 | ], 1831 | "output": "pd.DataFrame({'pet': {0: 'dog-a', 1: 'dog-b', 2: 'cat', 3: 'goldfish', 4: 'c-dog', 5: 'hamster'}, 'count': {0: 13, 1: 16, 2: 5, 3: 4, 4: 16, 5: 2}})", 1832 | "invars": [ 1833 | "df" 1834 | ], 1835 | "outvar": "df" 1836 | } 1837 | ], 1838 | "solutions": [ 1839 | "df.loc[df.pet.str.contains('dog|cat'),'count'] += 1\n" 1840 | ] 1841 | } 1842 | } 1843 | }, 1844 | "60": { 1845 | "sets": { 1846 | "A": { 1847 | "queries": [ 1848 | { 1849 | "query": "For the series data with multiple indices, create dataframe df with columns as the outermost level of index", 1850 | "user": "u0" 1851 | } 1852 | ], 1853 | "ios": [ 1854 | { 1855 | "inputs": [ 1856 | "pd.Series([1, 2, 3, 4])" 
1857 | ], 1858 | "output": "pd.DataFrame({'one': {'a': 1, 'b': 2}, 'two': {'a': 3, 'b': 4}})", 1859 | "invars": [ 1860 | "data" 1861 | ], 1862 | "outvar": "out" 1863 | } 1864 | ], 1865 | "solutions": [ 1866 | "out = data.unstack(level=0)\n" 1867 | ] 1868 | } 1869 | } 1870 | }, 1871 | "61": { 1872 | "sets": { 1873 | "A": { 1874 | "queries": [ 1875 | { 1876 | "query": "For df with multiindex columns, remove column 'outer_col' from df", 1877 | "user": "u0" 1878 | } 1879 | ], 1880 | "ios": [ 1881 | { 1882 | "inputs": [ 1883 | "_df = pd.DataFrame({('c', 'e'): {(1, 2): 3, (5, 6): 7, (9, 10): 11}, ('d', 'f'): {(1, 2): 4, (5, 6): 8, (9, 10): 12}})\n_df.columns.names = ['outer_col', 'inner_col']\n_df.index.names = ['a', 'b']\n_df" 1884 | ], 1885 | "output": "_df = pd.DataFrame({'e': {(1, 2): 3, (5, 6): 7, (9, 10): 11}, 'f': {(1, 2): 4, (5, 6): 8, (9, 10): 12}})\n_df.columns.names = ['inner_col']\n_df.index.names = ['a', 'b']\n_df", 1886 | "invars": [ 1887 | "df" 1888 | ], 1889 | "outvar": "dfout" 1890 | } 1891 | ], 1892 | "solutions": [ 1893 | "dfout = df.droplevel('outer_col', axis=1)\n" 1894 | ] 1895 | } 1896 | } 1897 | }, 1898 | "62": { 1899 | "sets": { 1900 | "A": { 1901 | "queries": [ 1902 | { 1903 | "query": "compute number of times 'mouse' is present in dataframe df", 1904 | "user": "u0" 1905 | } 1906 | ], 1907 | "ios": [ 1908 | { 1909 | "inputs": [ 1910 | "pd.DataFrame({'pet1': {0: 'mouse', 1: 'mouse', 2: 'cat', 3: 'goldfish', 4: 'bad-mouse', 5: 'hamster', 6: 'lion', 7: 'tiger-mouse', 8: 'mouse'}, 'pet2': {0: 12, 1: 'mouse', 2: 'dog', 3: 'mouse', 4: 'snake', 5: 'mouse', 6: 'tiger', 7: 'mouse', 8: 'mouse'}})" 1911 | ], 1912 | "output": "np.int64(8)", 1913 | "invars": [ 1914 | "df" 1915 | ], 1916 | "outvar": "out" 1917 | } 1918 | ], 1919 | "solutions": [ 1920 | "out = (df=='mouse').sum().sum()\n" 1921 | ] 1922 | } 1923 | } 1924 | }, 1925 | "63": { 1926 | "sets": { 1927 | "A": { 1928 | "queries": [ 1929 | { 1930 | "query": "compute number rows in which 'mouse' is 
present in dataframe df", 1931 | "user": "u0" 1932 | } 1933 | ], 1934 | "ios": [ 1935 | { 1936 | "inputs": [ 1937 | "pd.DataFrame({'pet1': {0: 'mouse', 1: 'mouse', 2: 'cat', 3: 'goldfish', 4: 'bad-mouse', 5: 'hamster', 6: 'lion', 7: 'tiger-mouse', 8: 'mouse'}, 'pet2': {0: 12, 1: 'mouse', 2: 'dog', 3: 'mouse', 4: 'snake', 5: 'mouse', 6: 'tiger', 7: 'mouse', 8: 'mouse'}})" 1938 | ], 1939 | "output": "np.int64(6)", 1940 | "invars": [ 1941 | "df" 1942 | ], 1943 | "outvar": "out" 1944 | } 1945 | ], 1946 | "solutions": [ 1947 | "out = (df=='mouse').any(axis=1).sum()\n" 1948 | ] 1949 | } 1950 | } 1951 | }, 1952 | "64": { 1953 | "sets": { 1954 | "A": { 1955 | "queries": [ 1956 | { 1957 | "query": "find number of elements in column 'phone' of df which are comprised totally of digits", 1958 | "user": "u0" 1959 | } 1960 | ], 1961 | "ios": [ 1962 | { 1963 | "inputs": [ 1964 | "pd.DataFrame({'phone': {0: '911', 1: '888', 2: '1-2-3', 3: '+00', 4: 'abc', 5: '334', 6: '00000', 7: '9/9/9', 8: '12/12/2012'}})" 1965 | ], 1966 | "output": "np.int64(4)", 1967 | "invars": [ 1968 | "df" 1969 | ], 1970 | "outvar": "out" 1971 | } 1972 | ], 1973 | "solutions": [ 1974 | "out = df['phone'].str.isdigit().sum()\n" 1975 | ] 1976 | } 1977 | } 1978 | }, 1979 | "65": { 1980 | "sets": { 1981 | "A": { 1982 | "queries": [ 1983 | { 1984 | "query": "Select the rows of df where column 'X' lies in integers between inclusive 4-10", 1985 | "user": "u0" 1986 | } 1987 | ], 1988 | "ios": [ 1989 | { 1990 | "inputs": [ 1991 | "pd.DataFrame({'X': {0: 2.0, 1: 5.0, 2: 5.5, 3: 8.0, 4: 16.0, 5: 12.0, 6: 10.0, 7: 11.0, 8: 4.0, 9: 21.0, 10: 20.0, 11: 4.0, 12: 14.0}, 'Y': {0: 5, 1: 7, 2: 4, 3: 2, 4: 7, 5: 12, 6: 0, 7: 1, 8: 4, 9: 2, 10: 3, 11: 10, 12: 3}})" 1992 | ], 1993 | "output": "pd.DataFrame({'X': {1: 5.0, 3: 8.0, 6: 10.0, 8: 4.0, 11: 4.0}, 'Y': {1: 7, 3: 2, 6: 0, 8: 4, 11: 10}})", 1994 | "invars": [ 1995 | "df", 1996 | "list", 1997 | "range" 1998 | ], 1999 | "outvar": "out" 2000 | }, 2001 | { 2002 | "inputs": [ 2003 |
"pd.DataFrame({'X': {0: 2.0, 1: 5.0, 2: 5.5, 3: 8.0, 4: 20.0, 5: 4.0, 6: 14.0}, 'Z': {0: 5, 1: 7, 2: 4, 3: 2, 4: 3, 5: 10, 6: 3}})" 2004 | ], 2005 | "output": "pd.DataFrame({'X': {0: 5, 1: 8, 2: 4}, 'Z': {0: 7, 1: 2, 2: 10}})", 2006 | "invars": [ 2007 | "df", 2008 | "list", 2009 | "range" 2010 | ], 2011 | "outvar": "out" 2012 | } 2013 | ], 2014 | "solutions": [ 2015 | "out = df[df['X'].isin(list(range(4,11)))]\n" 2016 | ] 2017 | } 2018 | } 2019 | }, 2020 | "66": { 2021 | "sets": { 2022 | "A": { 2023 | "queries": [ 2024 | { 2025 | "query": "Select the rows of df where column 'Y' lies in integers between inclusive 4-10 or 14-20", 2026 | "user": "u0" 2027 | } 2028 | ], 2029 | "ios": [ 2030 | { 2031 | "inputs": [ 2032 | "pd.DataFrame({'Y': {0: 2.0, 1: 5.0, 2: 5.5, 3: 8.0, 4: 16.0, 5: 12.0, 6: 10.0, 7: 11.0, 8: 4.0, 9: 21.0, 10: 20.0, 11: 10.0, 12: 14.0}, 'Z': {0: 5, 1: 7, 2: 4, 3: 2, 4: 7, 5: 12, 6: 0, 7: 1, 8: 4, 9: 2, 10: 3, 11: 10, 12: 3}})" 2033 | ], 2034 | "output": "pd.DataFrame({'Y': {1: 5.0, 3: 8.0, 4: 16.0, 6: 10.0, 8: 4.0, 10: 20.0, 11: 10.0, 12: 14.0}, 'Z': {1: 7, 3: 2, 4: 7, 6: 0, 8: 4, 10: 3, 11: 10, 12: 3}})", 2035 | "invars": [ 2036 | "df", 2037 | "list", 2038 | "range" 2039 | ], 2040 | "outvar": "out" 2041 | }, 2042 | { 2043 | "inputs": [ 2044 | "pd.DataFrame({'Y': {0: 2.0, 1: 5.0, 2: 5.5, 3: 8.0, 4: 20.0, 5: 4.0, 6: 14.0}, 'X': {0: 5, 1: 7, 2: 4, 3: 2, 4: 3, 5: 10, 6: 3}})" 2045 | ], 2046 | "output": "pd.DataFrame({'Y': {0: 5, 1: 8, 2: 20, 3: 4, 4: 14}, 'X': {0: 7, 1: 2, 2: 3, 3: 10, 4: 3}})", 2047 | "invars": [ 2048 | "df", 2049 | "list", 2050 | "range" 2051 | ], 2052 | "outvar": "out" 2053 | } 2054 | ], 2055 | "solutions": [ 2056 | "out = df[df['Y'].isin(list(range(4,11))+list(range(14,21)))]\n" 2057 | ] 2058 | } 2059 | } 2060 | }, 2061 | "67": { 2062 | "sets": { 2063 | "A": { 2064 | "queries": [ 2065 | { 2066 | "query": "Change the type of column 'colA' of df to 'float32'", 2067 | "user": "u0" 2068 | } 2069 | ], 2070 | "ios": [ 2071 | { 2072 | 
"inputs": [ 2073 | "pd.DataFrame({'colA': {0: 2, 1: 5, 2: 8, 3: 16, 4: 12, 5: 10, 6: 11, 7: 4, 8: 21, 9: 20, 10: 10, 11: 14}, 'colB': {0: 5, 1: 7, 2: 2, 3: 7, 4: 12, 5: 0, 6: 1, 7: 4, 8: 2, 9: 3, 10: 10, 11: 3}})" 2074 | ], 2075 | "output": "pd.DataFrame({'colA': {0: 2.0, 1: 5.0, 2: 8.0, 3: 16.0, 4: 12.0, 5: 10.0, 6: 11.0, 7: 4.0, 8: 21.0, 9: 20.0, 10: 10.0, 11: 14.0}, 'colB': {0: 5, 1: 7, 2: 2, 3: 7, 4: 12, 5: 0, 6: 1, 7: 4, 8: 2, 9: 3, 10: 10, 11: 3}})", 2076 | "invars": [ 2077 | "df" 2078 | ], 2079 | "outvar": "dfout" 2080 | } 2081 | ], 2082 | "solutions": [ 2083 | "dfout = df.astype({'colA' : 'float32'})\n" 2084 | ] 2085 | } 2086 | } 2087 | } 2088 | } --------------------------------------------------------------------------------