├── .gitignore ├── .travis.yml ├── Makefile ├── README.md ├── github_deploy_key.enc ├── index.rst ├── make.bat ├── requirements.txt ├── scripts └── merge-pr.py └── source ├── conf.py ├── copyonwrite.rst ├── goals.rst ├── index.rst ├── internal-architecture.rst ├── removals.rst └── strings.rst /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *.py[ocd] 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 3.5 4 | 5 | sudo: false 6 | 7 | env: 8 | global: 9 | - secure: "L8uXQbGKvVvOGMDIqgzIPj++1nO9M97tv/swZMm6uu3GHEj570p0mSWFxHlN97UQdMt+AAHPIEE/tg7ma7TiOANE0KJaD/6zmlfltvQtJZPngoN3xT3UHsNp/BuX2oDhevMV/LprLYygQ6xi5nHqB6Do9FS62zuZoE4UdFuBe+zec3+RSnW5LhfGQVCme4axHgf/j7vQGVKD7vBMyNqbDsqpKWdEr0d+4OhwLZwDM7I3lirWQ2W3DhRciaweI7G4AJZ4nkTXEfC+kZpGM6H1/Ho2ODqFGzbuudXe0VAnKUZxjWx7yb51+nxylKv7Q2UzOxb88loa91zBr729XGAm/RqXAWipO08MrZMbY0xlH7WoXjucsxUDT7bYhHLx+IQmg+2N1hswIqmpDXrv2ADVomt+R74iiV7UigCx3Ke02axiqdliyAMZcRVAl3Zg/jFXtL4JpFV//X1wJphtJuub7DKYiDr38h4Pr2cuR8fsTt1cF40AejXTJ1/W6X7fz2Q+bsCk/tlNmY4IUGPlXO/49AYC1aTIqwwgvzhYLHTtaGM5j1CWwu7P4eT4VYLSMhwj+MJVzQx9f6ZI/5eiZMI+lDzRpw7hPkgU9d5G/BEGcPDGIdVfhHcOVIZxrKV7p9dhF/nqjF3R8ZSV2yD6yHrFDZyWWJtixiAzaLTgxiyiRMs=" 10 | 11 | script: 12 | - pip install sphinx doctr ipython pandas 13 | - make html 14 | - doctr deploy --built-docs '_build/html' --gh-pages-docs . 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pandas20DesignDocs.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pandas20DesignDocs.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pandas20DesignDocs" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pandas20DesignDocs" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | 218 | OUTPUTDIR=_build/html 219 | DEPLOYREPOSITORY=pandas2 220 | 221 | deploy: html 222 | if test -d $(OUTPUTDIR); \ 223 | then echo " (build directory exists)"; \ 224 | else mkdir -p $(OUTPUTDIR); \ 225 | fi 226 | if test -d $(DEPLOYREPOSITORY); \ 227 | then echo " (repository directory exists)"; \ 228 | else git clone --branch=gh-pages git@github.com:pandas-dev/$(DEPLOYREPOSITORY).git; \ 229 | fi 230 | cd $(DEPLOYREPOSITORY) && git pull 231 | rsync -r $(OUTPUTDIR)/* $(DEPLOYREPOSITORY)/ 232 | cd $(DEPLOYREPOSITORY) && git add . && git commit -m "deploy" 233 | cd $(DEPLOYREPOSITORY) && git push origin gh-pages 234 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Design Documents for pandas Project 2 | 3 | This repository contains an evolving set of documents about the internal 4 | (library developer-facing) and external (user-facing) aspects of the pandas 5 | project. 
-------------------------------------------------------------------------------- /github_deploy_key.enc: -------------------------------------------------------------------------------- 1 | gAAAAABXxH5Us5lluuA7Co9E7AzJGyo0cEv7a9PHtfynb4IvdzBDPplpYk4FhCtXoQ9szAfP1KIOwoeM8DiZPg-N-aXf0qY7wKH-E6MDu4ggJTJr9CpJABY4sMnoc1-666iGOOT5RyQ9u2iyWyKWh1rSMPjOoPPt400oap_Hn1rKlzADzD2wdtYLBqs2VoXhIV8yNkMqSCU-j50cyYtl0sX8Zi3K3dmY7_aKyhwuWuB_IWH7rR3OQW82g37jv7OFHHQy5Os8YUfBBa9qAYpX48PfSTw6X8NwruKSczauateEIx-3TCIj8zeIYWbhtnG6k1AbuZt4d4Bypdxg5xD7xynZN-SBjyaS6JEr0YDadbiDAYgZRxMGHIx2Z0FR-rXMXEsM6J0obvXqAJF3DUTxvHH8S2LgY5Gk18ev_jM4mbHL0QfUSTnEw4Ch40rscrcjxBZGyYYdNcPPwOGmJTTn_y8dXUJHEJvx0lKgVpzTYdCz-aDehCGH2djE3QEtYeKFCJKYhvdELf9_0Ld6TJuBoBbgvQ1OdlMNjYUqkfr4QXThV1NBk4eQ95uwMRPVZecPNyXRPa9kSv7-JCiJr0H2rHQfiJjcE54rP4oTemi3sNT4vsjFgpGzOdG2nKWsWcO97TVWqhcg3uQpw1BzADZG5D2K49fEskIQuZAZcSrtvNuYsAqawOfugMf0WmidG8zRxTMVKrjnwmMshW6nYCBG3Q9qF90FJb-zLMR3foTpKSTf6BO5cjJwM9_7MAMuaNGsJMBmTebfIOClLyahRqHMqpNNGGD5gz-Iz5oVU7BXfrXwjcXTEc82fjr14S9iO5XL-I7gfm3BF06dKisGIby8d6ZhVuMsNOW1ZJGOm_vRYL5wzkI0N7jfNsV6ILmAzTE2Wv22Wu5Fd70aHbMOpmj9ubvLQqjjEl3QGsKYCGsaBQMmK7oDVv4OqxgAnbVeqrA2Dz2HS5thHLakuAza7E9UsxoOLih4qwXKvDQ8ApDSojeZM6z9Dg_usb-_87BEVsl4EE4ksHWzZFg_skuNHoNJtuVTTbnzGYXeuuJCFe7vcQUwE2WMSDmt2WhxMTF-U-6RiOifU7LgTmBXXrT8IDsch5Q_i6VMAB8uz9dZ_zRTxIkU7wRtrHYkiAsnIVCeTi12U6d3wOdJZL0dt_YS3N4fHOvi6RT1LgSMaEXZmB8PQKT9qshuSNz9ET2JdcSxUOi6lhA0WZfDB0lzCAOPxYwf1mIzz1l4Naj_K9mOyHznlXo3O7cONdIqioDbKZEANCGvN2Xv0hs-npjsVPprEnIQKzjTY8ju4N6U0GfNhXtoIPu3PLCMkoUIHsr2ZVm45uGGvQHkC_asRvuyR9P7-vdbglBZf8_gUdAn95l80tyWOxRwYq_G4lYLCxUEiwwBQcOqPy46hdbNJue4xJsnvuhPh-tLQm45RKt9YLonpriJPiCiqmlzhv3rfhQoBWy_8XjYTz6RUW8opCf18nrEeuxKCqpwVgf6fSagHMwGpLdN8mWKZwk3RTM3r1guTOPlUNKveGlMY5MUbYV3aHf5TWMnBIB91mij2ET41m5H_kdORTcs5znUIZn66R6TL5nFAeNnB5-o08xgaPyriVNhIYRmVEVg3SRUfdlZq_I0ymDxHm7eFExNZ0n46jg_3D0mkp3dCl5U8MKs6jZd7zonZTdbQg8XSxnDctMg8PMqmoGTsROuxF269sA3lg_s5iEcoHzhBJM8YejULqO4BDkoIi1V1hAfQZ9j5i_7unAQ1zfBvyKdCQGXh_vu5bzgAqbd5mgaZSMdXIbzewXz31zGbUlI1rlZ_gCoA2eK1O7nXNiBadkuiUc9d9IDM2_FPh54K6o9954w-jt4pzhcS-bzH0GRKTHzSGomgGbxelQOHiZO3_6mnFFGEdm47cq5nBk3VFiqKHv4TnKDuH5DdHvl-ab_-fuoQVBVGag-uslAuWomP4RtOSiVU0KuuwxqYrpmTws3d5XsNME7i9wsKu9SVcfw4opD7AF0mXwb6wvlkOC70gaT_yt5yGubBtWqd5eb8xiKZfXYz8KL1hU3hmIo1CzoxSHymjh8CWzgAm83ltmHCFSVwda-bPcUDaWkQe8o3vrb8uCt3eyV0WuBOXI7KevTLHCfhY7JuHazp8hvWoCu7j9OY1Gif4odZY8ilUoFbkftuUnajMUNvvVeubJJ_1H_hT6uz6LqE8014h7SQ5md16lJ2GvRoL4wBJoryrT0q8kNTmAmrUms06Wc_Lkyy9coTyiOsONFXD0cw_7WtbxvGsLZcLANfwRiKbEH9j0epeqOfM4qrSFIIZmbDkYAop6zqbgnHBI4R0a0GY3KIRVEWQa_2r_PX1fP61v8JD3r4S9490F5I5kzyXN0j61c4JqPq8RWqyxjfSEvToQ0IOK7UL9dzxmOKF5bL0jXMrFb0ArPipzFyrRmBohoDByFjPfkArYNHWRZuz6GDfSPP48PaSMVzCch6E-L2qSuF_uuH1Sq0IUoNHkDmCvmmGhVdgcKH0jVSOX3-TvZsGnaeagXCSdwH_yUcEdykhaLsmF7IupRwmri3YnOlupRNIDVoUz_ubSAUTfWZLYfuTaFzrrqrXZCmfpdMi0Z25lPZgWZgNpNZt4XcqsyhFyxYqTwfcGlxVeX1ClzreHDBeg9qxamn2cIz0qKKRSpY62kf4bFZz6u9EsmUKfkE_TIlpBkIWxSrVu34J8W_YyTdLHV7hlNaJWNYy0XTRLg8dXiegyljQuJK2O7H5bPXj4gYmsnehSOMNrS2i8AetD1kiJimhNjtYWB6F1ZVAjYRNo0WT139V1tOIqXBO8gM681wtEu5Qv6Sd2GbwaZPQiqMgTxi-IG7BGsEQ7q3RsMYIDjwWwm7f6Mbdd6DFFIhQit-9hkEvyNH1ELoQy2DMYoXnp0xVncDVNs304b5ugPaFnvCCH0V7Q25CJ2jEHSuNFLjnStlTSUjuzBQE53XbOmWLPdo-zwm-jCymcJxkzJ4UgZKLKDQBafEEeLNOKEzJ2ck8JCwOD_L9viD6E0WdRobfhaiRZAf21ELJgcOXavmf1yJXvaiP5jj7Jwu3TA0_te0UOnLtqv595xxrk3-zSNV8czFiIzDnnAdiHMDg8i24RJiBxqPqCxO42y5Rz7uViknSsC6sXvS0p7NqGDewN8C0JrpPpRfieW8uWEl7wQMKs36aXNQKisTAWIR7yuj_ij0JDE5nyGoxUgedcyGws2JE4OJyGjQqZ_6mXGUxVENzP4kKPA-rG3VgoqaMQk1Rj1yU68YL89Kb3pNKxHT-bKXFT2OMg8w-crn8gz4ODpobXqFkouWVc9JK87E2NKpFHf2EVVK4R1YjqCp6_FKvpGuchiI
48e6FXG8A9jcj_43dA_A8pSRuiPD6GklfgqU7qnV6LIDbyWeEvDhBXD_7p1uOmzqVES-4u3F2QYJY-ckvZAQuVpeIdxw03EB5lQhxtcruwvmkCVUk4ML5LiQr-Iz-o2suEjEZC7cU78ySznbGT0slv7PPVjQxTH0OnvGRPRnM_HHJSQL-RZbBC35TRJ4cdHORRLFL79_-PHmsvVNYke8sdWvEgKbEzM1Lxh4rxMrnzJJ5mwifJ1hYTpAkqeevSArWmXWyaTpt6YkWAZzS_rNnYj4jxS-qeFNXlTXlHeWKDzc1Wcs5k2pFOKpYCovd7sxfxqfDVFj6z3e5FyQP4hJ80kqhzx0UW1VOoUT4xSPWfrfoeV2B4bLzXLs3Wl6I03jAdCd9kWAbYa5hGokSIFYYfBALxUKqal4iCbR3BhSj5hHYalnNDtj6zaZTjKP_hgFqQkzMAN2AVQ-2GGXqD5j0V_wYIf0TK06S5WEiZhiqzTVbvwcqSBTw6MW4AGc74G1MkvS1R0q_MSw1skhmwb6eugaycwid0ACE7dqqt53f6Ar0uaauifNZ1nfjVaFR-5amw7Tc2rpoamXh6Kx2ewKt7Evxb8-pKorH8H6Rjsif3sdEgvGLpnUuWyJI1eaBI6ZSfSxmoZ9tsDMKMhPzpKZKXYPWPP2qtBoFsZFm2Nkb_etFpzO-3bEl6KLvaFqM1dNalw1fdgmE8pFmjXFCYHyAFpYoeFOnDusv8neaszRbqlZVSgEAoAaYm56tre5zaWRvL5lf1IaMlTJroSA-qN-tcxjQP_GMZwlpfBvEOM706uCI8MpwXUbCSU6w5nO3vR8uD95TZR3xPNnqIE6ajNpv6rJDTKsS9QGuL5UKWIjVjnsNOvd0VYYvr-jmTfzkfLhMEjWKzkYqnaxOSGTUhjL3PXxxSv6mv1cqHhK1zW3gREVNFFP5X6mEDsNF3C-QQvgIYvPJrYgRhq2DkVAtNBbNXCBolId1iFQSkLmakph7BjZGiJvhe-zC7xSuY7_RJIL6fP4Vw6PZiqHm5G3_0= -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | .. pandas 2.0 Design Docs documentation master file, created by 2 | sphinx-quickstart on Mon Aug 8 11:48:39 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pandas 2.0 Design Docs's documentation! 7 | ================================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. 
coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pandas20DesignDocs.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pandas20DesignDocs.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | sphinx 3 | -------------------------------------------------------------------------------- /scripts/merge-pr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Utility for creating well-formed pull request merges and pushing them to 21 | # Apache. 22 | # usage: ./apache-pr-merge.py (see config env vars below) 23 | # 24 | # Lightly modified from version of this script in incubator-parquet-format 25 | 26 | from __future__ import print_function 27 | 28 | from requests.auth import HTTPBasicAuth 29 | import requests 30 | 31 | import os 32 | import six 33 | import subprocess 34 | import sys 35 | import textwrap 36 | 37 | REPO_HOME = '.' 
38 | PROJECT_NAME = 'pandas-design' 39 | print("REPO_HOME = " + REPO_HOME) 40 | 41 | # Remote name with the PR 42 | PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "upstream") 43 | 44 | # Remote name where results pushed 45 | PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "upstream") 46 | 47 | GITHUB_BASE = "https://github.com/pydata/" + PROJECT_NAME + "/pull" 48 | GITHUB_API_BASE = "https://api.github.com/repos/pydata/" + PROJECT_NAME 49 | 50 | # Prefix added to temporary branches 51 | BRANCH_PREFIX = "PR_TOOL" 52 | 53 | os.chdir(REPO_HOME) 54 | 55 | auth_required = False 56 | 57 | if auth_required: 58 | GITHUB_USERNAME = os.environ['GITHUB_USER'] 59 | import getpass 60 | GITHUB_PASSWORD = getpass.getpass('Enter github.com password for %s:' 61 | % GITHUB_USERNAME) 62 | 63 | def get_json_auth(url): 64 | auth = HTTPBasicAuth(GITHUB_USERNAME, GITHUB_PASSWORD) 65 | req = requests.get(url, auth=auth) 66 | return req.json() 67 | 68 | get_json = get_json_auth 69 | else: 70 | def get_json_no_auth(url): 71 | req = requests.get(url) 72 | return req.json() 73 | 74 | get_json = get_json_no_auth 75 | 76 | 77 | def fail(msg): 78 | print(msg) 79 | clean_up() 80 | sys.exit(-1) 81 | 82 | 83 | def run_cmd(cmd): 84 | # py2.6 does not have subprocess.check_output 85 | if isinstance(cmd, six.string_types): 86 | cmd = cmd.split(' ') 87 | 88 | popenargs = [cmd] 89 | kwargs = {} 90 | 91 | process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs) 92 | output, unused_err = process.communicate() 93 | retcode = process.poll() 94 | if retcode: 95 | cmd = kwargs.get("args") 96 | if cmd is None: 97 | cmd = popenargs[0] 98 | raise subprocess.CalledProcessError(retcode, cmd, output=output) 99 | return output 100 | 101 | 102 | def continue_maybe(prompt): 103 | result = raw_input("\n%s (y/n): " % prompt) 104 | if result.lower() != "y": 105 | fail("Okay, exiting") 106 | 107 | 108 | original_head = run_cmd("git rev-parse HEAD")[:8] 109 | 110 | 111 | def clean_up(): 112 | print("Restoring head pointer to %s" % original_head) 113 | run_cmd("git checkout %s" % original_head) 114 | 115 | branches = run_cmd("git branch").replace(" ", "").split("\n") 116 | 117 | for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): 118 | print("Deleting local branch %s" % branch) 119 | run_cmd("git branch -D %s" % branch) 120 | 121 | 122 | # merge the requested PR and return the merge hash 123 | def merge_pr(pr_num, target_ref): 124 | pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) 125 | target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, 126 | target_ref.upper()) 127 | run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, 128 | pr_branch_name)) 129 | run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, 130 | target_branch_name)) 131 | run_cmd("git checkout %s" % target_branch_name) 132 | 133 | had_conflicts = False 134 | try: 135 | run_cmd(['git', 'merge', pr_branch_name, '--squash']) 136 | except Exception as e: 137 | msg = ("Error merging: %s\nWould you like to manually fix-up " 138 | "this merge?" % e) 139 | continue_maybe(msg) 140 | msg = ("Okay, please fix any conflicts and 'git add' " 141 | "conflicting files... 
Finished?") 142 | continue_maybe(msg) 143 | had_conflicts = True 144 | 145 | commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, 146 | '--pretty=format:%an <%ae>']).split("\n") 147 | distinct_authors = sorted(set(commit_authors), 148 | key=lambda x: commit_authors.count(x), 149 | reverse=True) 150 | primary_author = distinct_authors[0] 151 | commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, 152 | '--pretty=format:%h [%an] %s']).split("\n\n") 153 | 154 | merge_message_flags = [] 155 | 156 | merge_message_flags += ["-m", title] 157 | if body is not None: 158 | merge_message_flags += ["-m", '\n'.join(textwrap.wrap(body))] 159 | 160 | authors = "\n".join(["Author: %s" % a for a in distinct_authors]) 161 | 162 | merge_message_flags += ["-m", authors] 163 | 164 | if had_conflicts: 165 | committer_name = run_cmd("git config --get user.name").strip() 166 | committer_email = run_cmd("git config --get user.email").strip() 167 | message = ("This patch had conflicts when merged, " 168 | "resolved by\nCommitter: %s <%s>" 169 | % (committer_name, committer_email)) 170 | merge_message_flags += ["-m", message] 171 | 172 | # The string "Closes #%s" string is required for GitHub to correctly close 173 | # the PR 174 | merge_message_flags += [ 175 | "-m", 176 | "Closes #%s from %s and squashes the following commits:" 177 | % (pr_num, pr_repo_desc)] 178 | for c in commits: 179 | merge_message_flags += ["-m", c] 180 | 181 | run_cmd(['git', 'commit', '--author="%s"' % primary_author] + 182 | merge_message_flags) 183 | 184 | continue_maybe("Merge complete (local ref %s). Push to %s?" % ( 185 | target_branch_name, PUSH_REMOTE_NAME)) 186 | 187 | try: 188 | run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, 189 | target_ref)) 190 | except Exception as e: 191 | clean_up() 192 | fail("Exception while pushing: %s" % e) 193 | 194 | merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8] 195 | clean_up() 196 | print("Pull request #%s merged!" % pr_num) 197 | print("Merge hash: %s" % merge_hash) 198 | return merge_hash 199 | 200 | 201 | def cherry_pick(pr_num, merge_hash, default_branch): 202 | pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) 203 | if pick_ref == "": 204 | pick_ref = default_branch 205 | 206 | pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, 207 | pick_ref.upper()) 208 | 209 | run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, 210 | pick_branch_name)) 211 | run_cmd("git checkout %s" % pick_branch_name) 212 | run_cmd("git cherry-pick -sx %s" % merge_hash) 213 | 214 | continue_maybe("Pick complete (local ref %s). Push to %s?" % ( 215 | pick_branch_name, PUSH_REMOTE_NAME)) 216 | 217 | try: 218 | run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, 219 | pick_ref)) 220 | except Exception as e: 221 | clean_up() 222 | fail("Exception while pushing: %s" % e) 223 | 224 | pick_hash = run_cmd("git rev-parse %s" % pick_branch_name)[:8] 225 | clean_up() 226 | 227 | print("Pull request #%s picked into %s!" 
% (pr_num, pick_ref)) 228 | print("Pick hash: %s" % pick_hash) 229 | return pick_ref 230 | 231 | 232 | def fix_version_from_branch(branch, versions): 233 | # Note: Assumes this is a sorted (newest->oldest) list of un-released 234 | # versions 235 | if branch == "master": 236 | return versions[0] 237 | else: 238 | branch_ver = branch.replace("branch-", "") 239 | return filter(lambda x: x.name.startswith(branch_ver), versions)[-1] 240 | 241 | 242 | # branches = get_json("%s/branches" % GITHUB_API_BASE) 243 | # branch_names = filter(lambda x: x.startswith("branch-"), 244 | # [x['name'] for x in branches]) 245 | # Assumes branch names can be sorted lexicographically 246 | # latest_branch = sorted(branch_names, reverse=True)[0] 247 | 248 | pr_num = raw_input("Which pull request would you like to merge? (e.g. 34): ") 249 | pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) 250 | 251 | url = pr["url"] 252 | title = pr["title"] 253 | body = pr["body"] 254 | target_ref = pr["base"]["ref"] 255 | user_login = pr["user"]["login"] 256 | base_ref = pr["head"]["ref"] 257 | pr_repo_desc = "%s/%s" % (user_login, base_ref) 258 | 259 | if pr["merged"] is True: 260 | print("Pull request {0} has already been merged, please backport manually" 261 | .format(pr_num)) 262 | sys.exit(0) 263 | 264 | if not bool(pr["mergeable"]): 265 | msg = ("Pull request {0} is not mergeable in its current form.\n" 266 | "Continue? (experts only!)".format(pr_num)) 267 | continue_maybe(msg) 268 | 269 | print("\n=== Pull Request #%s ===" % pr_num) 270 | print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" 271 | % (title, pr_repo_desc, target_ref, url)) 272 | continue_maybe("Proceed with merging pull request #%s?" % pr_num) 273 | 274 | merged_refs = [target_ref] 275 | 276 | merge_hash = merge_pr(pr_num, target_ref) 277 | -------------------------------------------------------------------------------- /source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # pandas 2.0 Design Docs documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Aug 8 11:48:39 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | #sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | 33 | extensions = ['IPython.sphinxext.ipython_directive', 34 | 'IPython.sphinxext.ipython_console_highlighting'] 35 | 36 | ipython_mplbackend = None 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix(es) of source filenames. 
42 | # You can specify multiple suffix as a list of string: 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The encoding of source files. 47 | #source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = 'index' 51 | 52 | # General information about the project. 53 | project = "pandas 2.0 Design Docs" 54 | copyright = '2016, pandas Development Team' 55 | author = 'pandas Development Team' 56 | 57 | # The version info for the project you're documenting, acts as replacement for 58 | # |version| and |release|, also used in various other places throughout the 59 | # built documents. 60 | # 61 | # The short X.Y version. 62 | version = '0.1' 63 | # The full version, including alpha/beta/rc tags. 64 | release = '0.1' 65 | 66 | # The language for content autogenerated by Sphinx. Refer to documentation 67 | # for a list of supported languages. 68 | # 69 | # This is also used if you do content translation via gettext catalogs. 70 | # Usually you set "language" from the command line for these cases. 71 | language = None 72 | 73 | # There are two options for replacing |today|: either, you set today to some 74 | # non-false value, then it is used: 75 | #today = '' 76 | # Else, today_fmt is used as the format for a strftime call. 77 | #today_fmt = '%B %d, %Y' 78 | 79 | # List of patterns, relative to source directory, that match files and 80 | # directories to ignore when looking for source files. 81 | exclude_patterns = ['_build'] 82 | 83 | # The reST default role (used for this markup: `text`) to use for all 84 | # documents. 85 | #default_role = None 86 | 87 | # If true, '()' will be appended to :func: etc. cross-reference text. 88 | #add_function_parentheses = True 89 | 90 | # If true, the current module name will be prepended to all description 91 | # unit titles (such as .. function::). 92 | #add_module_names = True 93 | 94 | # If true, sectionauthor and moduleauthor directives will be shown in the 95 | # output. They are ignored by default. 96 | #show_authors = False 97 | 98 | # The name of the Pygments (syntax highlighting) style to use. 99 | pygments_style = 'sphinx' 100 | 101 | # A list of ignored prefixes for module index sorting. 102 | #modindex_common_prefix = [] 103 | 104 | # If true, keep warnings as "system message" paragraphs in the built documents. 105 | #keep_warnings = False 106 | 107 | # If true, `todo` and `todoList` produce output, else they produce nothing. 108 | todo_include_todos = False 109 | 110 | 111 | # -- Options for HTML output ---------------------------------------------- 112 | 113 | # The theme to use for HTML and HTML Help pages. See the documentation for 114 | # a list of builtin themes. 115 | import sphinx_rtd_theme 116 | 117 | html_theme = "sphinx_rtd_theme" 118 | 119 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 120 | 121 | # Theme options are theme-specific and customize the look and feel of a theme 122 | # further. For a list of options available for each theme, see the 123 | # documentation. 124 | #html_theme_options = {} 125 | 126 | # Add any paths that contain custom themes here, relative to this directory. 127 | #html_theme_path = [] 128 | 129 | # The name for this set of Sphinx documents. If None, it defaults to 130 | # " v documentation". 131 | #html_title = None 132 | 133 | # A shorter title for the navigation bar. Default is the same as html_title. 
134 | #html_short_title = None 135 | 136 | # The name of an image file (relative to this directory) to place at the top 137 | # of the sidebar. 138 | #html_logo = None 139 | 140 | # The name of an image file (within the static path) to use as favicon of the 141 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 142 | # pixels large. 143 | #html_favicon = None 144 | 145 | # Add any paths that contain custom static files (such as style sheets) here, 146 | # relative to this directory. They are copied after the builtin static files, 147 | # so a file named "default.css" will overwrite the builtin "default.css". 148 | html_static_path = [] 149 | 150 | # Add any extra paths that contain custom files (such as robots.txt or 151 | # .htaccess) here, relative to this directory. These files are copied 152 | # directly to the root of the documentation. 153 | #html_extra_path = [] 154 | 155 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 156 | # using the given strftime format. 157 | #html_last_updated_fmt = '%b %d, %Y' 158 | 159 | # If true, SmartyPants will be used to convert quotes and dashes to 160 | # typographically correct entities. 161 | #html_use_smartypants = True 162 | 163 | # Custom sidebar templates, maps document names to template names. 164 | #html_sidebars = {} 165 | 166 | # Additional templates that should be rendered to pages, maps page names to 167 | # template names. 168 | #html_additional_pages = {} 169 | 170 | # If false, no module index is generated. 171 | #html_domain_indices = True 172 | 173 | # If false, no index is generated. 174 | #html_use_index = True 175 | 176 | # If true, the index is split into individual pages for each letter. 177 | #html_split_index = False 178 | 179 | # If true, links to the reST sources are added to the pages. 180 | #html_show_sourcelink = True 181 | 182 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 183 | #html_show_sphinx = True 184 | 185 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 186 | #html_show_copyright = True 187 | 188 | # If true, an OpenSearch description file will be output, and all pages will 189 | # contain a tag referring to it. The value of this option must be the 190 | # base URL from which the finished HTML is served. 191 | #html_use_opensearch = '' 192 | 193 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 194 | #html_file_suffix = None 195 | 196 | # Language to be used for generating the HTML full-text search index. 197 | # Sphinx supports the following languages: 198 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 199 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' 200 | #html_search_language = 'en' 201 | 202 | # A dictionary with options for the search language support, empty by default. 203 | # Now only 'ja' uses this config value 204 | #html_search_options = {'type': 'default'} 205 | 206 | # The name of a javascript file (relative to the configuration directory) that 207 | # implements a search results scorer. If empty, the default will be used. 208 | #html_search_scorer = 'scorer.js' 209 | 210 | # Output file base name for HTML help builder. 211 | htmlhelp_basename = 'pandas20DesignDocsdoc' 212 | 213 | # -- Options for LaTeX output --------------------------------------------- 214 | 215 | latex_elements = { 216 | # The paper size ('letterpaper' or 'a4paper'). 217 | #'papersize': 'letterpaper', 218 | 219 | # The font size ('10pt', '11pt' or '12pt'). 
220 | #'pointsize': '10pt', 221 | 222 | # Additional stuff for the LaTeX preamble. 223 | #'preamble': '', 224 | 225 | # Latex figure (float) alignment 226 | #'figure_align': 'htbp', 227 | } 228 | 229 | # Grouping the document tree into LaTeX files. List of tuples 230 | # (source start file, target name, title, 231 | # author, documentclass [howto, manual, or own class]). 232 | latex_documents = [ 233 | (master_doc, 'pandas20DesignDocs.tex', 'pandas 2.0 Design Docs Documentation', 234 | 'pandas Development Team', 'manual'), 235 | ] 236 | 237 | # The name of an image file (relative to this directory) to place at the top of 238 | # the title page. 239 | #latex_logo = None 240 | 241 | # For "manual" documents, if this is true, then toplevel headings are parts, 242 | # not chapters. 243 | #latex_use_parts = False 244 | 245 | # If true, show page references after internal links. 246 | #latex_show_pagerefs = False 247 | 248 | # If true, show URL addresses after external links. 249 | #latex_show_urls = False 250 | 251 | # Documents to append as an appendix to all manuals. 252 | #latex_appendices = [] 253 | 254 | # If false, no module index is generated. 255 | #latex_domain_indices = True 256 | 257 | 258 | # -- Options for manual page output --------------------------------------- 259 | 260 | # One entry per manual page. List of tuples 261 | # (source start file, name, description, authors, manual section). 262 | man_pages = [ 263 | (master_doc, 'pandas20designdocs', 'pandas 2.0 Design Docs Documentation', 264 | [author], 1) 265 | ] 266 | 267 | # If true, show URL addresses after external links. 268 | #man_show_urls = False 269 | 270 | 271 | # -- Options for Texinfo output ------------------------------------------- 272 | 273 | # Grouping the document tree into Texinfo files. List of tuples 274 | # (source start file, target name, title, author, 275 | # dir menu entry, description, category) 276 | texinfo_documents = [ 277 | (master_doc, 'pandas20DesignDocs', 'pandas 2.0 Design Docs Documentation', 278 | author, 'pandas20DesignDocs', 'One line description of project.', 279 | 'Miscellaneous'), 280 | ] 281 | 282 | # Documents to append as an appendix to all manuals. 283 | #texinfo_appendices = [] 284 | 285 | # If false, no module index is generated. 286 | #texinfo_domain_indices = True 287 | 288 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 289 | #texinfo_show_urls = 'footnote' 290 | 291 | # If true, do not generate a @detailmenu in the "Top" node's menu. 292 | #texinfo_no_detailmenu = False 293 | -------------------------------------------------------------------------------- /source/copyonwrite.rst: -------------------------------------------------------------------------------- 1 | .. _copyonwrite: 2 | 3 | ================================== 4 | View semantics and Copy-On-Write 5 | ================================== 6 | -------------------------------------------------------------------------------- /source/goals.rst: -------------------------------------------------------------------------------- 1 | .. _goals: 2 | 3 | ======================= 4 | Goals and Motivations 5 | ======================= 6 | 7 | The pandas codebase is now over 8 years old, having grown to over 200,000 lines 8 | of code from its original ~10,000 LOC in the original 0.1 open source release 9 | in January 2010. 
10 | 11 | At a high level, the "pandas 2.0" effort is based on a number of observations: 12 | 13 | * The pandas 0.x series of releases have consisted of huge amounts of 14 | iterative improvements to the library along with some major new features, bug 15 | fixes, and improved documentation. There have also been a series of 16 | deprecations, API changes, and other evolutions of pandas's API to account 17 | for suboptimal design choices (for example: the ``.ix`` operator) made in the 18 | early days of the project (2010 to 2012). 19 | * The unification of Series and DataFrame internals to be based on a common 20 | ``NDFrame`` base class and "block manager" data structure (originally created 21 | by me in 2011, and heroically driven forward to its modern form by Jeff 22 | Reback), while introducing many benefits to pandas, has come to be viewed as 23 | a long-term source of technical debt and code complexity. 24 | * pandas's ability to support an increasingly broad set of use cases has been 25 | significantly constrained (as will be examined in detail in these documents) 26 | by its tight coupling to NumPy and therefore subject to various limitations 27 | in NumPy. 28 | * Making significant functional additions (particularly filling gaps in NumPy) 29 | to pandas, particularly new data types, has grown increasingly complex with 30 | very obvious accumulations of technical debt. 31 | * pandas is being used increasingly for very large datasets on machines with 32 | many cores and large amounts of RAM (100s of gigabytes to terabytes). It 33 | would be nice to be able to better utilize these larger, beefier systems 34 | within a single Python process. 35 | * pandas is being used increasingly as a computational building block of some 36 | larger system, such as Dask or Apache Spark. We should consider reducing the 37 | overhead for making data accessible to pandas (i.e. via memory-mapping or 38 | other low-overhead memory sharing). 39 | * Rough edges in pandas's implementation (e.g. its handling of missing data 40 | across data types) are being exposed to users. 41 | 42 | These documents are largely concerned with pandas's internal design, which is 43 | mostly invisible to average users. Advanced users of pandas are generally 44 | familiar with some of these internal details, particularly around performance and 45 | memory use, and so the degree to which users are impacted will vary quite a 46 | lot. 47 | 48 | Goals 49 | ===== 50 | 51 | Some high-level goals of the pandas 2.0 plan include the following: 52 | 53 | * Fixing long-standing limitations or inconsistencies in missing data: null 54 | values in integer and boolean data, and a more consistent notion of null / 55 | NA. 56 | * Improved performance and utilization of multicore systems 57 | * Better user control / visibility of memory usage (which can be opaque and 58 | difficult to control) 59 | * Clearer semantics around non-NumPy data types, and permitting new pandas-only 60 | data types to be added 61 | * Exposing a "libpandas" C/C++ API to other Python library developers: the 62 | internals of Series and DataFrame are only weakly accessible in other 63 | developers' native code. This has been a limitation for scikit-learn and 64 | other projects requiring C or Cython-level access to pandas object data. 
65 | * Removal of deprecated functionality 66 | 67 | Non-goals / FAQ 68 | =============== 69 | 70 | As this will be a quite nuanced discussion, especially for those not intimately 71 | familiar with pandas's implementation details, I wanted to speak to a couple of 72 | commonly-asked questions in brief: 73 | 74 | ```` 75 | 76 | 1. **Will this work make it harder to use pandas with NumPy, scikit-learn, 77 | statsmodels, SciPy, or other libraries that depend on NumPy 78 | interoperability?** 79 | * We are not planning on it. Data that is representable without memory 80 | copying or conversion in NumPy arrays will continue to be 100% 81 | interoperable. 82 | * Data containing missing (NA) values may require explicit conversion where 83 | it is not currently required. For example: integer or boolean type arrays 84 | with missing data. I trust this will be seen as a positive development. 85 | * If anything, more performant and more precise data semantics in pandas will 86 | generally make production code using a downstream library like scikit-learn 87 | more dependable and future-proof. 88 | 89 | ```` 90 | 91 | 2. **By decoupling from NumPy, it sounds like you are reimplementing NumPy or 92 | adding a new data type system** 93 | 94 | * Simply put: no. But it's more complicated than that because of the 95 | numerous interpretations of "type system". 96 | 97 | * pandas already contains a large amount (10s of KLOCs) of custom 98 | computational code (see, for example, 99 | ``_) that implements 100 | functionality not present in NumPy. 101 | 102 | * pandas already features its own (what I will describe as a) "logical type 103 | system", including things like custom data types (such as that of 104 | ``pandas.Categorical``), pandas-specific missing data representation, and 105 | implicit type casting (e.g. integer to float on introduction of missing 106 | data). Unfortunately, these logical data types are somewhat weakly 107 | expressed, and the mix of NumPy dtype objects and custom pandas types is 108 | problematic for many internal (implementation) and external (user API) 109 | reasons. I will examine in detail the difference between **physical 110 | types** (i.e. NumPy's dtypes) and **logical types** (i.e. what pandas 111 | currently has, implicitly). 112 | 113 | ```` 114 | 115 | 3. **Shouldn't you try to accomplish your goals by contributing work to NumPy 116 | instead of investing major work in pandas's internals?** 117 | 118 | * In my opinion, this is a "false dichotomy"; i.e. these things are not 119 | mutually exclusive. 120 | 121 | * Yes, we should define, scope, and if possible help implement improvements 122 | to NumPy that make sense. As NumPy serves a significantly larger and more 123 | diverse set of users, major changes to the NumPy C codebase must be 124 | approached more conservatively. 125 | 126 | * It is unclear that pandas's body of domain-specific data handling and 127 | computational code is entirely "in scope" for NumPy. Some technical 128 | details, such as our categorical or datetime data semantics, "group by" 129 | functionality, relational algebra (joins), etc., may be ideal for pandas 130 | but not necessarily ideal for a general user of NumPy. My opinion is that 131 | functionality from NumPy we wish to use in pandas should "pass through" to 132 | the user unmodified, but we must retain the flexibility to work "outside 133 | the box" (implement things not found in NumPy) without adding technical 134 | debt or user API complexity. 135 | 136 | ```` 137 | 138 | 4. 
**API changes / breaks are thought to be bad; don't you have a 139 | responsibility to maintain backwards compatibility for users that heavily 140 | depend on pandas?** 141 | 142 | * It's true that APIs should not be broken or changed lightly, and any such 143 | change should be approached with extreme caution. 144 | 145 | * The goal of the pandas 2.0 initiative is to only make "good" API breaks 146 | that yield a net benefit that can be easily demonstrated. As an example: 147 | adding native missing data support to integer and boolean data (without 148 | casting to another physical storage type) may break user code that has 149 | knowledge of the "rough edge" (the behavior that we are fixing). As these 150 | changes will mostly affect advanced pandas users, I expect they will be 151 | welcomed. 152 | 153 | * Any major API change or break will be documented and justified to assist 154 | with code migration. 155 | 156 | * As soon as we are able, we will post binary development artifacts for the 157 | pandas 2.0 development branch to get early feedback from heavy pandas 158 | users to understand the impact of changes and how we can better help the 159 | existing user base. 160 | 161 | * Some users will find that a certain piece of code has been working "by 162 | accident" (i.e. relying upon undocumented behavior). This kind of breakage 163 | is already a routine occurrence unfortunately. 164 | 165 | Summary 166 | ======= 167 | 168 | Overall, the goal of the pandas 2.0 project is to yield a faster, more cleanly 169 | architected, and more future-proof library that is a drop-in replacement for 170 | 90-95% of pandas user code. There will be API / code breakages, but the intent 171 | of any code breakage will almost always be to fix something that has been 172 | "wrong" or inconsistent. Many advanced users will have worked around some of 173 | these rough edges, and so their workarounds may either need to be removed or 174 | changed to accommodate the new (and hopefully it can be agreed in each case: 175 | better) semantics. 176 | -------------------------------------------------------------------------------- /source/index.rst: -------------------------------------------------------------------------------- 1 | pandas 2.0 Design Documents 2 | =========================== 3 | 4 | .. note:: 5 | 6 | These documents are a work in progress. Please see ongoing discussions on 7 | http://github.com/pydata/pandas-design 8 | 9 | These are a set of documents, based on discussions started in December 2015, to 10 | assist with discussions around changes to Python pandas's internal design 11 | intended to better accommodate the evolving needs of the growing Python data 12 | userbase and to help ensure that pandas remains a relevant and important 13 | project in the future. 14 | 15 | We will also use this place to collect ideas for things to remove (such as 16 | deprecated features, but possibly other things) from the library that don't 17 | necessarily depend on the internal implementation. 18 | 19 | .. toctree:: 20 | :maxdepth: 3 21 | 22 | goals 23 | internal-architecture 24 | strings 25 | copyonwrite 26 | removals 27 | 28 | .. Indices and tables 29 | .. ================== 30 | 31 | .. * :ref:`genindex` 32 | .. * :ref:`modindex` 33 | .. * :ref:`search` 34 | -------------------------------------------------------------------------------- /source/internal-architecture.rst: -------------------------------------------------------------------------------- 1 | .. _internal-architecture: 2 | 3 | .. 
ipython:: python 4 | :suppress: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | np.set_printoptions(precision=4, suppress=True) 9 | pd.options.display.max_rows = 100 10 | 11 | =================================== 12 | Internals: Data structure changes 13 | =================================== 14 | 15 | Logical types and Physical Storage Decoupling 16 | ============================================= 17 | 18 | Since this is the most important, but perhaps also most controversial, change 19 | (in my opinion) to pandas, I'm going to go over it in great detail. I think the 20 | hardest part is coming up with clear language and definitions for concepts so 21 | that we can communicate effectively. For example the term "data type" is vague 22 | and may mean different things to different people. 23 | 24 | A motivating example 25 | ~~~~~~~~~~~~~~~~~~~~ 26 | 27 | Before digging too much into the technical details and problems/solutions, 28 | let's look at some code examples. It is not unusual to find code like this in 29 | pandas's internals: 30 | 31 | .. code-block:: python 32 | 33 | def create_from_value(value, index, dtype): 34 | # return a new empty value suitable for the dtype 35 | 36 | if is_datetimetz(dtype): 37 | subarr = DatetimeIndex([value] * len(index), dtype=dtype) 38 | elif is_categorical_dtype(dtype): 39 | subarr = Categorical([value] * len(index)) 40 | else: 41 | if not isinstance(dtype, (np.dtype, type(np.dtype))): 42 | dtype = dtype.dtype 43 | subarr = np.empty(len(index), dtype=dtype) 44 | subarr.fill(value) 45 | 46 | or 47 | 48 | .. code-block:: python 49 | 50 | if is_categorical_dtype(dtype): 51 | upcast_cls = 'category' 52 | elif is_datetimetz(dtype): 53 | upcast_cls = 'datetimetz' 54 | elif issubclass(dtype.type, np.bool_): 55 | upcast_cls = 'bool' 56 | elif issubclass(dtype.type, np.object_): 57 | upcast_cls = 'object' 58 | elif is_datetime64_dtype(dtype): 59 | upcast_cls = 'datetime' 60 | elif is_timedelta64_dtype(dtype): 61 | upcast_cls = 'timedelta' 62 | else: 63 | upcast_cls = 'float' 64 | 65 | I've cherry-picked one of a number of places where this type of datatype-based 66 | branching happens. 67 | 68 | The primary reason for this complexity is that pandas is using both NumPy's 69 | dtype objects (which describe *physical storage*) as well as its own custom 70 | data type objects as a proxy for pandas's *semantic logical types*. 71 | 72 | Let's step back for a second and come up with clear language to steer the 73 | discussion. 74 | 75 | Some definitions 76 | ~~~~~~~~~~~~~~~~ 77 | 78 | Here is my attempt at definitions of some of the key terms: 79 | 80 | * **Metadata**: data that describes other data (such as its in-memory layout) 81 | 82 | * **Semantics**: The meaning / abstract interpretation of something. We often 83 | discuss the semantics (meaning) of computer programs (i.e. what they do, 84 | fundamentally) without touching upon low level details like machine 85 | representation, programming languages, compilers, operating systems, etc. 86 | 87 | * **Physical data (or storage) types**: these are metadata objects which 88 | provide a description of the precise structure of a piece of data in memory. 89 | 90 | * In NumPy, the ``numpy.dtype`` object (aka ``PyArray_Descr`` in the C API) 91 | is metadata describing a single cell / value in an array. 
Combined with the 92 | ``shape`` and ``strides`` attributes of the ``ndarray`` object, you have 93 | enough information to perform O(1) random access on any cell in an 94 | ``ndarray`` and to assign these values to a C type (or, in the case, of 95 | structured dtypes, assign to a packed C struct). 96 | 97 | * This may or may not include a physical representation of NULL or missing 98 | data (for example: nullable float64 might be a physical type indicating a 99 | normal float64 array along with a bitmap of null/not-null indicators). 100 | 101 | * **Logical data type**: metadata which describes the semantic content of a 102 | single value in an array or other collection of values. Depending on the 103 | logical type, it may map 1-to-1 to a physical type or not at all. Here are 104 | some examples: 105 | 106 | * The ``double`` or ``float64`` type may be viewed both as a logical type as 107 | well as a physical type (a 1-to-1 correspondence). 108 | 109 | * pandas's ``category`` dtype contains its own auxiliary array of category 110 | values (for example, the distinct strings collected from a string 111 | array). Based on the number of categories, the category ``codes`` (which 112 | reference the categories array) are stored in the smallest possible integer 113 | physical type (from ``int8`` to ``int64``, depending whether the data type 114 | can accommodate the codes). For example, if there are 50 codes, the data is 115 | represented in ``int8`` storage. For 1000 codes, it would be ``int16``. 116 | 117 | * Another example: timestamps may be physically stored in ``int64`` 118 | storage, and these values are interpreted in the context of a particular 119 | time unit or resolution (e.g. nanoseconds, milliseconds, seconds). 120 | 121 | In general, new logical types may be formed either by placing new semantics on 122 | top of a single physical data type or some composition of physical or logical 123 | types. For example: you could have a categorical type (a logical construct 124 | consisting of multiple arrays of data) whose categories are some other logical 125 | type. 126 | 127 | For historical reasons, **pandas never developed a clear or clean semantic 128 | separation in its user API between logical and physical data types**. Also, the 129 | addition of new, pandas-only "synthetic" dtypes that are unknown to NumPy (like 130 | categorical, datetimetz, etc.) has expanded this conflation considerably. If 131 | you also consider pandas's custom missing / NULL data behavior, the addition of 132 | ad hoc missing data semantics to a physical NumPy data type created, by the 133 | definitions above, a logical data type (call it ``object[nullable]`` for an 134 | object array) without ever explicitly saying so. 135 | 136 | You might be thinking, "Good job, Wes. You really messed that up!" I'd be 137 | inclined to agree with you now in retrospect, but back in 2011 pandas was not 138 | the super popular project that it is today, and we were truly riding on NumPy's 139 | coat tails. So the extent to which NumPy concepts and APIs were used explicitly 140 | in pandas made the library easier to adopt. Now in 2016, this feels 141 | anachronistic / outdated. 142 | 143 | High-level logical type proposal 144 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 145 | 146 | As we have been discussing periodically on the pandas-dev mailing list and 147 | GitHub, I am proposing that we start to unravel our current mess by defining 148 | pandas-specific metadata objects that model the current semantics / behavior of 149 | the project. 
What does this mean, exactly? 150 | 151 | * Each NumPy dtype object will map 1-to-1 to an equivalent ``pandas.DataType`` 152 | object. 153 | * Existing pandas "extension dtypes" (like ``CategoricalDtype`` and 154 | ``DatetimeTZDtype``), which have been designed to mimic ``numpy.dtype``, will 155 | become logical type subclasses of ``pandas.DataType`` like every other type 156 | in pandas. 157 | 158 | Since pandas is about assisting with data manipulation and analysis, at some 159 | point you must invoke functions that are specialized to the specific physical 160 | memory representation of your data. For example, pandas has its own 161 | implementations of ``ndarray.take`` that are used internally for arrays of 162 | positive integers that may contain NULL / NA values (which are represented as 163 | -1 -- search the codebase for implementations of ``take_1d``). 164 | 165 | The major goals of introducing a logical type abstraction are as follows: 166 | 167 | * Simplifying "dynamic dispatch": invoking the right functions or choosing the 168 | right code branches based on the data type. 169 | * Enabling pandas to decouple both its internal semantics and physical storage 170 | from NumPy's metadata and APIs. Note that this is already happening with 171 | categorical types, since a particular instance of ``CategoricalDtype`` may 172 | physically be stored in one of 4 NumPy data types. 173 | 174 | Physical storage decoupling 175 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 176 | 177 | By separating pandas data from the presumption of using a particular physical 178 | ``numpy.dtype`` internally, we can: 179 | 180 | * Begin to better protect users from NumPy data semantics (which are frequently 181 | different from pandas's!) leaking through to the pandas user API. This can 182 | enable us to address long-standing inconsistencies or "rough edges" in pandas 183 | that have persisted due to our tight semantic coupling to NumPy. 184 | 185 | * Consider adding new data structures to pandas, either custom to pandas 186 | or provided by 3rd-party libraries, that add new functionality alongside the 187 | existing code (presuming NumPy physical storage). As one concrete example, 188 | discussed in more detail below, we can enable missing data in integer pandas 189 | data by forming a composite data structure consisting of a NumPy array plus a 190 | bitmap marking the null / not-null values (a small sketch of this idea follows at the end of this section). 191 | 192 | - It may end up being a requirement that 3rd-party data structures will need 193 | to have a C or C++ API to be used in pandas. 194 | 195 | * Start to think about improved behavior around data ownership (like 196 | copy-on-write), which may yield many benefits. I will write a dedicated 197 | section about this. 198 | 199 | Note that none of these points implies that we are trying to use NumPy 200 | less. We already have large amounts of code that implement algorithms similar 201 | to those found in NumPy (e.g. ``pandas.unique`` or the implementation of 202 | ``Series.sum``), but taking into account pandas's missing data representation, 203 | etc. Internally, we can use NumPy when its computational semantics match those 204 | we've chosen for pandas, and elsewhere we can invoke pandas-specific code. 205 | 206 | A major concern here based on these ideas is **preserving NumPy 207 | interoperability**, so I'll examine this topic in some detail next.
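To make the integer-plus-bitmap composite mentioned above more concrete, here is a minimal Python sketch of the idea. The names used here (``NullableInt64Array``, ``set_null``, ``to_numpy``) are purely illustrative assumptions for this document, not proposed APIs, and the real containers would live in native code rather than pure Python:

.. code-block:: python

    import numpy as np

    class NullableInt64Array:
        """Illustrative only: int64 values plus a validity bitmap (bit = 1 means not null)."""

        def __init__(self, values):
            self.values = np.asarray(values, dtype=np.int64)
            nbytes = (len(self.values) + 7) // 8
            # One validity bit per value, LSB bit ordering, all valid initially
            self.bitmap = np.full(nbytes, 0xFF, dtype=np.uint8)

        def set_null(self, i):
            # Clear the validity bit; the int64 slot itself keeps whatever value it had
            self.bitmap[i // 8] &= np.uint8(~(1 << (i % 8)) & 0xFF)

        def is_null(self, i):
            return not (self.bitmap[i // 8] >> (i % 8)) & 1

        def to_numpy(self, dtype=np.float64, na_rep=np.nan):
            # Explicit (and potentially lossy) conversion for NumPy interoperability
            out = self.values.astype(dtype)
            mask = np.fromiter((self.is_null(i) for i in range(len(out))), dtype=bool)
            out[mask] = na_rep
            return out

    arr = NullableInt64Array([1, 2, 3, 4, 5])
    arr.set_null(2)
    arr.to_numpy()  # array([ 1.,  2., nan,  4.,  5.])

The key property is that introducing a null never changes the logical integer type of the data; only an explicit conversion (such as the ``to_numpy`` call above) produces a ``float64`` array with ``NaN`` fill values.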
208 | 209 | Correspondence between logical and physical types 210 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 211 | 212 | * **Floating point numbers** 213 | 214 | - Logical: ``Float16/32/64`` 215 | - Physical: ``numpy.float16/32/64``, with ``NaN`` for null (for backwards 216 | compatibility) 217 | 218 | * **Signed Integers** 219 | 220 | - Logical: ``Int8/16/32/64`` 221 | - Physical: ``numpy.int8/16/32/64`` array plus nullness bitmap 222 | 223 | * **Unsigned Integers** 224 | 225 | - Logical: ``UInt8/16/32/64`` 226 | - Physical: ``numpy.uint8/16/32/64`` array plus nullness bitmap 227 | 228 | * **Boolean** 229 | 230 | - Logical: ``Boolean`` 231 | - Physical: ``np.bool_`` (a.k.a. ``np.uint8``) array plus nullness bitmap. We 232 | may also explore bit storage (versus bytes). 233 | 234 | * **Categorical** 235 | 236 | - Logical: ``Categorical[T]``, where ``T`` is any other logical type 237 | - Physical: this type is a composition of a ``Int8`` through ``Int64`` 238 | (depending on the cardinality of the categories) plus the categories 239 | array. These have the same physical representation as 240 | 241 | * **String and Binary** 242 | 243 | - Logical: ``String`` and ``Binary`` 244 | - Physical: Dictionary-encoded representation for UTF-8 and general binary 245 | data as described in the `string section `. 246 | 247 | * **Timestamp** 248 | 249 | - Logical: ``Timestamp[unit]``, where unit is the resolution. Nanoseconds can 250 | continue to be the default unit for now 251 | - Physical: ``numpy.int64``, with ``INT64_MIN`` as the null value. 252 | 253 | * **Timedelta** 254 | 255 | - Logical: ``Timedelta[unit]``, where unit is the resolution 256 | - Physical: ``numpy.int64``, with ``INT64_MIN`` as the null value. 257 | 258 | * **Period** 259 | 260 | - Logical: ``Period[unit]``, where unit is the resolution 261 | - Physical: ``numpy.int64``, with ``INT64_MIN`` as the null value. 262 | 263 | * **Interval** 264 | 265 | - Logical: ``Interval`` 266 | - Physical: two arrays of ``Timestamp[U]`` -- these may need to be forced to 267 | both be the same resolution 268 | 269 | * **Python objects** (catch-all for other data types) 270 | 271 | - Logical: ``Object`` 272 | - Physical: ``numpy.object_`` array, with None for null values (perhaps with 273 | ``np.NaN`` also for backwards compatibility) 274 | 275 | * **Complex numbers** 276 | 277 | - Logical: ``Complex64/128`` 278 | - Physical: ``numpy.complex64/128``, with ``NaN`` for null (for backwards 279 | compatibility) 280 | 281 | Some notes on these: 282 | 283 | - While a pandas (logical) type may map onto one or more physical 284 | representations, in general NumPy types will map directly onto a pandas 285 | type. Thus, existing code involving ``numpy.dtype``-like objects (such as 286 | ``'f8'`` or ``numpy.float64``) will continue to work. 287 | 288 | Preserving NumPy interoperability 289 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 290 | 291 | Some of types of intended interoperability between NumPy and pandas are as 292 | follows: 293 | 294 | * **Access to internal data**: Users can obtain the a ``numpy.ndarray`` 295 | (possibly a view depending on the internal block structure, more on this 296 | soon) in constant time and without copying the actual data. This has a couple 297 | other implications 298 | 299 | * Changes made to this array will be reflected in the source pandas object. 
300 | * If you write C extension code (possibly in Cython) and respect pandas's 301 | missing data details, you can invoke certain kinds of fast custom code on 302 | pandas data (but it's somewhat inflexible -- see the latest discussion on 303 | adding a native code API to pandas). 304 | 305 | * **Ufuncs**: NumPy ufuncs (like ``np.sqrt`` or ``np.log``) can be invoked on 306 | pandas objects like Series and DataFrame 307 | 308 | * **Array protocol**: ``numpy.asarray`` will always yield some array, even if 309 | it discards metadata or has to create a new array. For example ``asarray`` 310 | invoked on ``pandas.Categorical`` yields a reconstructed array (rather than 311 | either the categories or codes internal arrays) 312 | 313 | * **Interchangeability**: Many NumPy methods designed to work on subclasses (or 314 | duck-typed classes) of ``ndarray`` may be used. For example ``numpy.sum`` may 315 | be used on a Series even though it does not invoke NumPy's internal C sum 316 | algorithm. This means that a Series may be used as an interchangeable 317 | argument in a large set of functions that only know about NumPy arrays. 318 | 319 | By and large, I think much of this can be preserved, but there will be some API 320 | breakage. In particular, interchangeability is not something we can or should 321 | guarantee. 322 | 323 | If we add more composite data structures (Categorical can be thought of as 324 | one existing composite data structure) to pandas or alternate non-NumPy data 325 | structures, there will be cases where the semantic information in a Series 326 | cannot be adequately represented in a NumPy array. 327 | 328 | As one example, if we add pandas-only missing data support to integer and 329 | boolean data (a long requested feature), calling ``np.asarray`` on such data 330 | may not have well-defined behavior. As present, pandas is implicitly converting 331 | these types to ``float64`` (see more below), which isn't too great. A decision 332 | does not need to be made now, but the benefits of solving this long-standing 333 | issue may merit breaking ``asarray`` as long as we provide an explicit way to 334 | obtain the original casted ``float64`` NumPy array (with ``NaN`` for NULL/NA 335 | values) 336 | 337 | For pandas data that does not step outside NumPy's semantic realm, we can 338 | continue to provide zero-copy views in many cases. 339 | 340 | Missing data consistency 341 | ======================== 342 | 343 | Once the physical memory representation has been effectively decoupled from the 344 | user API, we can consider various approaches to implementing missing data in a 345 | consistent way for every logical pandas data type. 346 | 347 | To motivate this, let's look at some integer data: 348 | 349 | .. ipython:: python 350 | 351 | s = pd.Series([1, 2, 3, 4, 5]) 352 | s 353 | s.dtype 354 | s.values 355 | 356 | If we assign a ``numpy.NaN``, see what happens: 357 | 358 | .. ipython:: python 359 | 360 | s[2] = np.NaN 361 | s 362 | s.dtype 363 | s.values 364 | 365 | The story for boolean data is similar: 366 | 367 | .. ipython:: python 368 | 369 | s = pd.Series([True, False, True]) 370 | s.dtype 371 | s[2] = np.NaN 372 | s.dtype 373 | s.values 374 | 375 | This implicit behavior appears in many scenarios, such as: 376 | 377 | * Loading data from any source: databases, CSV files, R data files, etc. 
378 | * Joins or reindexing operations introducing missing data 379 | * Pivot / reshape operations 380 | * Time series resampling 381 | * Certain types of GroupBy operations 382 | 383 | A proposed solution 384 | ~~~~~~~~~~~~~~~~~~~ 385 | 386 | My proposal for introducing missing data into any NumPy type outside of 387 | floating point (which uses ``NaN`` for now) and Python object (which uses 388 | ``None`` or ``NaN`` interchangeably) is to **allocate and manage an internal 389 | bitmap** (which the user never sees). This has numerous benefits: 390 | 391 | * 1 byte of memory overhead for each 8 values 392 | * Bitmaps can propagate their nulls in C through bitwise ``&`` or ``|`` 393 | operations, which are inexpensive. 394 | * Getting and setting bits on modern hardware is CPU-inexpensive. For 395 | single-pass array operations (like groupbys) on large arrays this may also 396 | result in better CPU cache utilization (fewer main-memory reads of the 397 | bitmap). 398 | * Hardware and SIMD "popcount" intrinsics (which can operate on 64-128 bits at 399 | a time) can be used to count bits and skip null-handling on segments of data 400 | containing no nulls. 401 | 402 | Notably, this is the way that PostgreSQL handles null values. For example, we 403 | might have: 404 | 405 | .. code-block:: text 406 | 407 | [0, 1, 2, NA, NA, 5, 6, NA] 408 | 409 | i: 7 6 5 4 3 2 1 0 410 | bitmap: 0 1 1 0 0 1 1 1 411 | 412 | Here, the convention of 1 for "not null" (a la PostgreSQL) and 413 | least-significant bit ordering (LSB "bit endianness") is being used. 414 | 415 | Under the new regime, users could simply write: 416 | 417 | .. code-block:: python 418 | 419 | s[2] = pandas.NA 420 | 421 | and the data type would be unmodified. It may be necessary to write something 422 | akin to: 423 | 424 | .. code-block:: python 425 | 426 | s.to_numpy(dtype=np.float64, na_rep=np.nan) 427 | 428 | and that would emulate the current behavior. Attempts to use ``__array__` (for 429 | example: calling ``np.sqrt`` on the data) would result in an error since we 430 | will likely want to refuse to make a guess as for what casting behavior the 431 | user desires. 432 | 433 | Tradeoffs 434 | ~~~~~~~~~ 435 | 436 | One potential downside of the bitmap approach is that missing data implemented 437 | outside of NumPy's domain will need to be explicitly converted if it is needed 438 | in another library that only knows about NumPy. I argue that this is better 439 | than the current implicit conversion which could yield data loss (for integers 440 | falling outside the exact representable range for ``float64``). 441 | 442 | Removal of BlockManager / new DataFrame internals 443 | ================================================= 444 | 445 | Deep inside the belly pandas objects, there is a data structure called 446 | ``BlockManager`` which, at a high level, is responsible for managing the 447 | physical arrays where the data inside a Series or DataFrame is looked 448 | after (also Panel / PanelND structure, even though these are on their way to 449 | deprecation). 450 | 451 | While this data structure has served pandas well since its birth 5 years ago 452 | (Summer 2011), it has a number of problems that make its removal and 453 | replacement with something else an attractive option. 454 | 455 | The goal of this section is to explain what the BlockManager is, why it exists 456 | at all, and why we should consider removing it. 457 | 458 | What is ``BlockManager`` and why does it exist? 
459 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 460 | 461 | The reason that ``BlockManager`` exists at all goes back to some ancient pandas 462 | history. Originally, the data in ``pandas.DataFrame`` was stored in a Python 463 | ``dict`` object. If you pull up pandas 0.1 or 0.2, you will see this. 464 | 465 | Since the business logic of pandas's internals was originally implemented in 466 | pure Python, as it is still is (but much larger / more complex), there was a 467 | marked performance difference between column-oriented operations and 468 | row-oriented operations. The reason for this is not really a memory layout 469 | issue (NumPy users know about how contiguous memory access produces much better 470 | performance) so much as a reliance on NumPy's two-dimensional array operations 471 | for carrying out pandas's computations. So, to do anything row oriented on an 472 | all-numeric DataFrame, pandas would concatenate all of the columns together 473 | (using ``numpy.vstack`` or ``numpy.hstack``) then use array broadcasting or 474 | methods like ``ndarray.sum`` (combined with ``np.isnan`` to mind missing data) 475 | to carry out certain operations. 476 | 477 | 1. pandas's early users (i.e. AQR employees) beseeched me to address this 478 | performance issue. Thus ``DataMatrix`` was created, a roughly API-equivalent 479 | object whose internal storage was a 2D NumPy array, intended to be of a 480 | homogeneous type (e.g. ``numpy.float64``). The downside of this was that if 481 | you inserted a string column, everything would become ``numpy.object_`` 482 | dtype. Users did not like that. 483 | 484 | 2. It had become apparent that the dichotomy between DataFrame and DataMatrix 485 | (and when to use each) was harming pandas's adoption and confusing users. So 486 | I set about creating a hybrid data structure that had "the best of both 487 | worlds". 488 | 489 | 3. The idea was that the BlockManager would track collections of NumPy arrays 490 | having the same dtype, particular as columns were inserted or removed 491 | (i.e. the *building* phase of the DataFrame's lifetime). 492 | 493 | 4. When you would invoke an operation that benefited from a single 494 | *consolidated* 2-dimensional ndarray of say ``float64`` dtype (for example: 495 | using ``reindex`` or performing a row-oriented operation), the BlockManager 496 | would glue together its accumulated pieces to create a single 2D ndarray of 497 | each data type. This is called **consolidation** in the codebase. 498 | 499 | 5. Since in practice, heterogeneous DataFrames had different types interspersed 500 | amongst their columns, the BlockManager maintains a mapping between the 501 | absolute column position and the relative position within the type-specific 502 | 2D "block". 503 | 504 | 6. Over time, the BlockManager has been generalized for the 1 through N 505 | dimensional cases, not just the 2D case, so that even Series has a lean 506 | "SingleBlockManager" internally. 507 | 508 | Another motivation for the BlockManager was to be able to create DataFrame 509 | objects with zero copy from two-dimensional NumPy arrays. See Jeff Reback's 510 | `exposition on this 511 | `_. 512 | 513 | Drawbacks of BlockManager 514 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 515 | 516 | While this data structure has enabled pandas to make it this far in life, it 517 | has a number of drawbacks (not a complete list): 518 | 519 | 1. 
**Code complexity**: this has manifested in a number of ways (and probably 520 | others that I'm missing) 521 | 522 | * Making some of the most important algorithms in pandas fast, like joins 523 | and reshape operations, requires carefully constructing the precise block 524 | structure of the output DataFrame so that no further copying or 525 | consolidation will take place. 526 | 527 | * Adding new custom data types to DataFrame and not losing their metadata 528 | (e.g. time zones or categories) has had a sort of "fan out" effect 529 | touching numerous parts of the BlockManager internals. 530 | 531 | 2. **Loss of user visibility into memory use and memory layout**: With large 532 | data sets, some "naively" constructed DataFrame objects (e.g. from a dict of 533 | ndarrays) can produce a memory-doubling effect that may cause out-of-memory 534 | errors. Also, consolidated blocks can (depending on the version of pandas) 535 | result in columns having strided / non-contiguous data, resulting in 536 | degraded performance in column-oriented operations. 537 | 538 | 3. **Unavoidable consolidation**: Fairly common operations, like ``read_csv``, 539 | may require a consolidation step after completion, which for large data may 540 | result in performance or memory overhead (similar to the above bullet 541 | point). 542 | 543 | 4. **Microperformance issues / indexing slowness**: since a DataFrame can be a 544 | sort of many-layered onion, many common pandas operations may weave through 545 | dozens of different functions navigating the structure of the object and 546 | producing the appropriate output. I will talk more about microperformance 547 | later. 548 | 549 | Replacing BlockManager without weakening pandas 550 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 551 | 552 | Our goal in replacing BlockManager would be to achieve: 553 | 554 | * Substantially simpler code 555 | * Easier extensibility with new logical types 556 | * Performance on par with (or better than) the current implementation 557 | * Better user control over memory use and layout 558 | * Improved microperformance 559 | 560 | I believe we can do this, but it will require a significant inversion of the 561 | internal code architecture, involving more native code and less interpreted 562 | Python. For example, it will be difficult or impossible to achieve comparable 563 | performance in row-oriented operations (on consolidated DataFrame objects) with 564 | pure Python code. 565 | 566 | In the next section, I will start making my case for creating a "native core" 567 | library where we can assemble the low-level data structures, logical types, and 568 | memory management for pandas. Additionally, we would want to port much of 569 | pandas's helper Cython code to live inside this library and operate directly on 570 | the internal data structures rather than being orchestrated from the Python 571 | interpreter level. 572 | 573 | Building "libpandas" in C++11/14 for lowest level implementation tier 574 | ===================================================================== 575 | 576 | Currently, pandas architecturally is structured as follows: 577 | 578 | * Pure Python implementation of internal data structure business logic 579 | * Algorithms in Cython (more often) or C (less often) to accelerate 580 | computationally-intensive algorithms 581 | 582 | While this has overall made pandas easier to develop and maintain internally 583 | (perhaps increasingly less so over time!), it has also had a number of drawbacks, 584 | as we've discussed. 
I mentioned microperformance above, so about that: 585 | 586 | Microperformance 587 | ~~~~~~~~~~~~~~~~ 588 | 589 | Microperformance (operations taking 1 microsecond to 1 millisecond) has 590 | suffered considerably as pandas's internals have expanded to accommodate new 591 | use cases. Fairly simple operations, from indexing to summary statistics, may 592 | pass through multiple layers of scaffolding before hitting the lowest tier of 593 | computations. Let's take for example: 594 | 595 | .. ipython:: python 596 | 597 | s = pd.Series(np.random.randn(100)) 598 | s.sum() 599 | 600 | Profiling ``s.sum()`` with ``%prun`` in IPython, I am seeing 116 function 601 | calls (pandas 0.18.1). Let's look at the microperformance: 602 | 603 | .. code-block:: text 604 | 605 | In [14]: timeit s.sum() 606 | 10000 loops, best of 3: 31.7 µs per loop 607 | 608 | In [15]: v = s.values 609 | 610 | In [16]: timeit v.sum() 611 | 1000000 loops, best of 3: 1.07 µs per loop 612 | 613 | While a slightly contrived example, the internal data structures and function 614 | dispatch machinery add 30 microseconds of overhead. That may not be a 615 | compelling number, but such a method called 1 million times has an additional 616 | 30 seconds of overhead. When you consider microperformance in the context of 617 | custom ``groupby`` operations, for example, this may not be so unrealistic. 618 | 619 | C or C++ (C++11, to be specific)? 620 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 621 | 622 | At the risk of instigating a religious programming language debate, pandas's 623 | use of Cython in many places is very C++-like: 624 | 625 | * Generic programming through manual code generation (now using tempita) 626 | instead of templates 627 | * Auxiliary types and data structures as ``cdef class`` extension types 628 | * Relying on Python's reference counting for garbage collection and cleanup 629 | after exceptions are raised. The "blend C and Cython" style has aided 630 | developer productivity. 631 | 632 | I argue that judicious and responsible use of modern C++ (and following a 633 | reasonable style guide like `Google's guide 634 | `_, or some slight variation) 635 | will enable us to: 636 | 637 | * Simplify our existing Cython codebase by using templates (and very limited, 638 | template metaprogramming) 639 | 640 | * Easier generic programming / inlining of data-type specific logic at compile 641 | time. 642 | 643 | * Use RAII (exception-safe allocation) and smart pointers (``std::unique_ptr`` 644 | and ``std::shared_ptr``) to simplify memory management 645 | 646 | * Can use the STL (e.g. ``std::unordered_map`` for some hash tables) for 647 | standard data structures, or incorporate other C++ data structures (e.g. from 648 | Google open source libraries) that have been more optimized for certain use 649 | cases. 650 | 651 | * Define performant C++ classes modeling the current internals, with various 652 | mechanisms for code reuse or type-specific dynamic dispatch (i.e. through 653 | template classes, CRTP, or simply virtual functions). 654 | 655 | * Use C++11 standard library concurrency tools to more easily create concurrent 656 | / multithreaded implementations of common pandas algorithms. 657 | 658 | By pushing down much of the business logic into C++ (with use of the Python and 659 | NumPy C API where relevant), we'll be able to achieve macroperformance on par 660 | or better than the current BlockManager-based implementation and handily better 661 | microperformance in indexing and simple analytics. 
662 | 663 | ``pandas.Array`` types 664 | ~~~~~~~~~~~~~~~~~~~~~~ 665 | 666 | My gut feeling is that we would want to create relatively simple container 667 | classes having a common ``pandas::Array`` base type in C++, each of which 668 | models a particular logical type. Each array type would have a corresponding 669 | logical type implementation, in the vein of: 670 | 671 | .. code-block:: c++ 672 | 673 | class Array { 674 | // public API omitted 675 | private: 676 | std::shared_ptr<DataType> type_; 677 | }; 678 | 679 | class CategoricalType : public DataType { 680 | // implementation 681 | 682 | private: 683 | std::shared_ptr<Array> categories_; 684 | }; 685 | 686 | class CategoricalArray : public Array { 687 | public: 688 | std::shared_ptr<Array> codes() const; 689 | std::shared_ptr<Array> categories() const; 690 | // rest of implementation omitted 691 | }; 692 | 693 | An array containing a NumPy array will invoke ``Py_DECREF`` in its destructor, 694 | so that after construction one can proceed largely with C++ programming 695 | semantics without much need for manual memory management. 696 | 697 | These Array types would be wrapped and exposed to pandas developers (probably 698 | in Cython). 699 | 700 | We would also want to provide a public Python API to the ``pandas.Array`` type, 701 | which would be the object returned by ``Series.values``. For example, at 702 | present we have: 703 | 704 | .. ipython:: python 705 | 706 | s = pd.Series([1,2] * 2) 707 | s 708 | s.values 709 | s2 = s.astype('category') 710 | s2.values 711 | type(s2.values) 712 | 713 | By introducing a consistent base array type, we can eliminate the current 714 | dichotomy between pandas's extension dtypes and built-in NumPy physical dtypes. 715 | 716 | We could also define a limited public API for interacting with these data 717 | containers directly. 718 | 719 | Index types 720 | ~~~~~~~~~~~ 721 | 722 | Like pandas's current code structure, Index types would be composed from the 723 | Array types and some additional data structures (hash tables) for lookups and 724 | other index operations. These can be similarly exposed to the world via Cython 725 | (and wrapped in a convenient pandas.Index class). 726 | 727 | ``pandas.Table`` 728 | ~~~~~~~~~~~~~~~~ 729 | 730 | My recommendation is to decommission the BlockManager in favor of a much 731 | simpler low-level Table class, which operates more similarly to an R data.frame 732 | (e.g. no row index). This would look something like: 733 | 734 | .. code-block:: c++ 735 | 736 | class Table { 737 | public: 738 | std::shared_ptr<Array> GetColumn(int i); 739 | void SetColumn(int i, const std::shared_ptr<Array>& arr); 740 | 741 | // rest of public API omitted 742 | private: 743 | // Column index, possibly not necessary 744 | std::shared_ptr<Index> columns_; 745 | 746 | // List of arrays 747 | std::vector<std::shared_ptr<Array>> data_; 748 | }; 749 | 750 | Operators and dynamic dispatch 751 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 752 | 753 | Under this proposed class structure, it may not make sense to add operations as 754 | class methods. We could possibly do something like: 755 | 756 | .. 
code-block:: c++ 757 | 758 | #include "pandas/dispatch.h" 759 | 760 | // other includes omitted 761 | 762 | using ArrayRef = std::shared_ptr<Array>; 763 | 764 | template <typename U, typename V> 765 | inline ArrayRef TakeImpl(U, V) { 766 | // Implementation omitted 767 | } 768 | 769 | ArrayRef Take(ArrayRef values, ArrayRef indices) { 770 | return Dispatch<TakeImpl>(values, indices); 771 | } 772 | 773 | Here, the ``Dispatch`` template would generate the matrix of logical type 774 | combinations, some of which might throw a not-implemented exception. 775 | 776 | There are other approaches to dealing with runtime dispatch that don't feature 777 | too much overhead. 778 | 779 | Memory accounting 780 | ~~~~~~~~~~~~~~~~~ 781 | 782 | If pandas's internals are encapsulated in C++ classes inside the libpandas core 783 | library, we could atomically track all memory allocations and deallocations to 784 | produce a precise accounting of the number of bytes that pandas has currently 785 | allocated (that are not opaque, so Python objects would only include their 786 | ``PyObject**`` array footprint). 787 | 788 | Development toolchain 789 | ~~~~~~~~~~~~~~~~~~~~~ 790 | 791 | Introducing C++11 to pandas's development toolchain will add quite a bit of 792 | complexity for developers, especially compared with pandas's current Cython and 793 | C codebase, which basically builds out of the box for most people. It would be 794 | better for cross-platform support to use CMake than something else (distutils 795 | doesn't have adequate support for C++). 796 | 797 | Logical types for strings and possibly other non-numeric data 798 | ============================================================= 799 | 800 | I believe that frequently-occurring data types, such as UTF-8 strings, are 801 | important enough to deserve a dedicated logical pandas data type. This will 802 | enable us both to enforce tighter API semantics (i.e. attempts to assign a 803 | non-string into string data will be a ``TypeError``) and to improve performance 804 | and memory use under the hood. I will devote an entire section to talking about 805 | strings. 806 | 807 | In general, I would be supportive of making Python object (``numpy.object_`` 808 | dtype) arrays the solution only for mixed-type arrays and data types for which 809 | pandas has no native handling. 810 | 811 | 3rd-party native API (i.e. Cython and C / C++) 812 | ============================================== 813 | 814 | Developers of 3rd-party projects (myself included) have often expressed a 815 | desire to be able to inspect, construct, or otherwise manipulate pandas objects 816 | (even if in a limited fashion) in compiled code (Cython, C, or C++). 817 | 818 | Per the discussion of libpandas and a native core, I would propose the 819 | following: 820 | 821 | * Define public-facing ``.pxd`` files that allow developers to use ``cimport`` 822 | and get access to pandas's internal extension types. 823 | * Define factory functions that enable fully formed Series and DataFrame objects 824 | to be constructed either by Cython API calls or potentially also C++ 825 | libpandas API calls. 826 | * Provide Cython APIs for 3rd-party developers to obtain pointers to access the 827 | underlying C++ objects contained in the wrapper Python objects. 828 | -------------------------------------------------------------------------------- /source/removals.rst: -------------------------------------------------------------------------------- 1 | .. 
_removals: 2 | 3 | =========================== 4 | Other miscellaneous ideas 5 | =========================== 6 | 7 | Dropping Python 2 support 8 | ========================= 9 | 10 | With Python 2.7 reaching its supported end-of-life in 2020, like some other 11 | Python projects (e.g. IPython / Jupyter) we should seriously contemplate making 12 | pandas 2.0 only support Python 3.5 and higher. In addition to lowering the 13 | development burden at both the C API and pure Python level, we can also finally 14 | look to take advantage of features (things like ``asyncio``, maybe?) only 15 | available in Python 3. 16 | 17 | Deprecated code to remove 18 | ========================= 19 | 20 | * ``.ix`` indexing entirely 21 | * ``Panel`` and ``PanelND`` classes 22 | * Plotting? 23 | 24 | Other ideas 25 | =========== 26 | 27 | Here's a collection of other miscellaneous ideas that don't necessarily fit 28 | elsewhere in these documents. 29 | 30 | Column statistics 31 | ~~~~~~~~~~~~~~~~~ 32 | 33 | In quite a few pandas algorithms, there are characteristics of the data that 34 | are very useful to know, such as: 35 | 36 | * **Monotonicity**: for comparable data (e.g. numbers), is the data sorted / 37 | strictly increasing? In time series, this permits sorting steps to be 38 | skipped. 39 | 40 | * **Null count**: for data not containing any nulls, the null handling path in 41 | some algorithms can be skipped entirely 42 | 43 | * **Uniqueness**: used in indexes, and can be helpful elsewhere 44 | 45 | Strided arrays: more trouble than they are worth? 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | Per the general discussion around changing DataFrame's internals to contain a 49 | list / ``std::vector`` of arrays internally, for me this begs the question of 50 | the benefits of continuing to accommodate strided one-dimensional data. 51 | 52 | Some pros for eliminating strided data completely: 53 | 54 | * Guaranteeing contiguous memory internally will yield more consistent and 55 | predictable performance. 56 | 57 | * Not needing to consider a stride different from 1 means simpler low-level 58 | array indexing code (e.g. you can work with plain C arrays). The stride is a 59 | complexity / overhead that leaks to every algorithm that iterates over an 60 | array. 61 | 62 | * You avoid strange situations where a strided view holds onto a base ndarray 63 | reference to a much larger array 64 | 65 | * **Example:** ``_. Here, the 66 | internal orientation (column-major vs. row-major) is not clear to the user. 67 | 68 | Some cons: 69 | 70 | * It would not be possible to perform zero-copy computations on a strided NumPy 71 | array 72 | 73 | * Relatedly, initializing a Series or DataFrame from strided memory would 74 | require allocating an equivalent amount of contiguous memory for each of the 75 | columns. 76 | 77 | For me, at least, I don't find the cons compelling enough to warrant the code 78 | complexity tradeoff. 79 | 80 | Enforcing immutability in GroupBy functions 81 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 82 | 83 | Side effects from ``groupby`` operations have been a common source of issues or 84 | unintuitive behavior for users. 85 | 86 | Handling of sparse data structures 87 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 88 | 89 | It's possible that the sparse types could become first class logical types, 90 | e.g. ``Sparse[T]``, eliminating the ``Sparse*`` classes. 
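To illustrate what a first-class sparse logical type might look like, here is a rough, hypothetical Python sketch. The ``SparseArray`` name, its layout, and the ``densify`` method are assumptions made purely for illustration, not a design commitment:

.. code-block:: python

    import numpy as np

    class SparseArray:
        """Illustrative only: store just the non-fill values and their positions."""

        def __init__(self, length, indices, values, fill_value=0.0):
            self.length = length                                # logical length of the column
            self.indices = np.asarray(indices, dtype=np.int64)  # positions of stored values
            self.values = np.asarray(values)                    # the stored (non-fill) values
            self.fill_value = fill_value

        def densify(self):
            # Materialize a dense array; a Sparse[T] logical type would delegate
            # the element type and null handling to its parametrized subtype T
            out = np.full(self.length, self.fill_value, dtype=self.values.dtype)
            out[self.indices] = self.values
            return out

    sparse = SparseArray(8, indices=[1, 5], values=[3.5, 7.0])
    sparse.densify()  # array([0. , 3.5, 0. , 0. , 0. , 7. , 0. , 0. ])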
91 | -------------------------------------------------------------------------------- /source/strings.rst: -------------------------------------------------------------------------------- 1 | .. _strings: 2 | 3 | .. ipython:: python 4 | :suppress: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | np.set_printoptions(precision=4, suppress=True) 9 | pd.options.display.max_rows = 100 10 | 11 | ============================================= 12 | Internals: Enhanced string / UTF-8 handling 13 | ============================================= 14 | 15 | There are some things we can do to make pandas use less memory and perform 16 | computations significantly faster on string data. 17 | 18 | Current string problems 19 | ======================= 20 | 21 | pandas offers support for columns containing strings (ASCII or Unicode) on a 22 | somewhat ad hoc basis. 23 | 24 | * Strings are stored in NumPy arrays of ``PyObject*`` / ``numpy.object_`` 25 | dtype. This has several problems 26 | 27 | * Computations (e.g. ``groupby`` operations) typically utilize a code path 28 | for generic Python objects. For example comparisons or hashing goes through 29 | the ``PyObject_*`` C API functions. In addition to harming multithreading 30 | due to GIL contention (you must acquire the GIL to use these functions), 31 | these can also be significantly slower than algorithms that operate on 32 | ``const char*``, potentially taking advantage of hardware optimizations. 33 | 34 | * String arrays often feature many copies of or references to the same 35 | PyString. Thus, some algorithms may perform redundant computation. Some 36 | parts of pandas, like ``pandas.read_csv``, make an effort to deduplicate 37 | strings to free memory and accelerate computations (e.g. if you do ``x == 38 | y``, and ``x`` and ``y`` are references to the same ``PyObject*``, Python 39 | skips comparing their internal data). 40 | 41 | * Note that this is somewhat mitigated by using ``pandas.Categorical``, but 42 | this is not the default storage mechanism. More on this below. 43 | 44 | * Using ``PyString`` objects and ``PyObject*`` NumPy storage adds non-trivial 45 | overhead (52 bytes in Python 3, slightly less in Python 2, see `this 46 | exposition `_ for a 47 | deeper drive) to each value. 48 | 49 | Possible solution: new non-NumPy string memory layout 50 | ===================================================== 51 | 52 | My proposed solution to the string conundrum is the following: 53 | 54 | * Create a custom string array container type suitable for use in a 55 | ``pandas.Array``, and a ``pandas.string`` logical data type. 56 | * Require that all strings be encoded as UTF-8. 57 | * By default, represent all string arrays internally as dictionary-encoded 58 | a.k.a. categorical. Thus, we will typically only ever have 1 copy of any 59 | given string in an array. 60 | * Store the actual string data in a packed UTF-8 buffer. I have seen this in a 61 | number of places, but notably it's the way that `Apache Arrow implements 62 | variable-length collections 63 | `_. 64 | 65 | Here is one possible C struct-like layout of this container: 66 | 67 | .. code-block:: c++ 68 | 69 | typedef struct { 70 | /* Category / dictionary indices into the string data */ 71 | uint32_t* indices; 72 | 73 | /* The encoded string lengths */ 74 | uint32_t* offsets; 75 | 76 | /* The packed UTF-8 data */ 77 | const char* data; 78 | 79 | /* For nullness */ 80 | uint8_t* bitmap; 81 | } string_array_t; 82 | 83 | Here's an example of what the data would look like: 84 | 85 | .. 
code-block:: text 86 | 87 | actual data : ['foo', 'bars', 'foo', null, 'bars'] 88 | 89 | indices: [0, 1, 0, 0, 1] 90 | 91 | bitmap[0] (read right-to-left): 0 0 0 1 0 1 1 1 92 | 93 | 94 | offsets: [0, 3, 7] 95 | data: ['f', 'o', 'o', 'b', 'a', 'r', 's'] 96 | 97 | Some benefits of this approach include: 98 | 99 | * Much better data locality for low-cardinality categorical data 100 | * 8.125 bytes (8 bytes plus 1 bit) of memory overhead per value, versus 33 to 52 101 | bytes currently. 102 | * The data is already categorical: casting to ``category`` dtype can be performed 103 | very cheaply and without duplicating the underlying string memory buffer 104 | * Computations like ``groupby`` on dictionary-encoded strings will be as 105 | performant as those on Categorical currently are. 106 | 107 | Some drawbacks: 108 | 109 | * This memory layout is best used as an immutable representation. Mutating 110 | slots here becomes more complex. Both single-value assignment and put / 111 | array-assignment will likely require constructing a new ``data`` buffer 112 | (either by ``realloc`` or some other copying mechanism). Without a compaction 113 | / "garbage collection" step on this buffer, it will be possible to have "dead" 114 | memory inside it (for example, if you did ``arr[:] = 'a-new-string-value'``, 115 | all the existing values would be orphaned). 116 | 117 | * Some systems have addressed this issue by storing all string data in a 118 | "global string hash table". This is something we could explore, but it 119 | would add quite a bit of complexity to implement and may not be worthwhile 120 | at this time. 121 | 122 | * Indexing into this data structure to obtain a single Python object will 123 | probably want to call ``PyUnicode_FromStringAndSize`` to construct a string 124 | (Python 3, therefore Unicode). This requires a memory allocation, whereas it 125 | currently only has to do a ``Py_INCREF``. 126 | 127 | * Many of pandas's existing algorithms assuming Python objects would need to be 128 | specialized to take advantage of this new memory layout. This is both a pro 129 | and a con, as it will most likely yield significantly better performance. 130 | 131 | Concerns / problems 132 | =================== 133 | 134 | Preserving code that assumes PyString objects 135 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 136 | 137 | Any alternate UTF-8 string in-memory representation should necessarily be able 138 | to yield Python string objects using ``PyUnicode_FromStringAndSize``. Thus, 139 | code like this could continue to work: 140 | 141 | .. ipython:: python 142 | 143 | s = pd.Series(["como estás?"]) 144 | s.map(lambda x: x.upper()) 145 | 146 | One trade-off is that creating the temporary Python strings is potentially 147 | costly. This could be mitigated for Python ``str`` methods (with an optimized 148 | array-oriented code path under the hood), but for arbitrary functions you would 149 | have to pay this cost. 150 | 151 | Accommodating Non-UTF-8 data 152 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 153 | 154 | Some pandas users will have code that involves various non-UTF-8 Python string 155 | types: 156 | 157 | * Native unicode: Py_UCS1, Py_UCS2, Py_UCS4 158 | * Non-UTF-8 PyBytes 159 | 160 | .. ipython:: python 161 | 162 | s = pd.Series(["como estás?"]) 163 | s 164 | s.str.encode('latin-1') 165 | s.str.encode('latin-1').str.decode('latin-1') 166 | 167 | Such data could arise from reading a CSV file in a non-UTF-8 encoding where the 168 | encoding was not indicated to ``pandas.read_csv``. 
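As a brief illustration of why such data cannot simply be stored as-is in the UTF-8 string representation, note that bytes produced by another codec are generally not valid UTF-8; the snippet below is only meant to demonstrate the decoding failure:

.. code-block:: python

    raw = "como estás?".encode('latin-1')
    raw.decode('utf-8')
    # Raises UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe1 in
    # position 8: invalid continuation byte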
169 | 170 | My proposed solution for handling such non-UTF-8 data is to provide a ``binary`` 171 | logical type having the same physical memory layout as UTF-8 strings, with only 172 | the metadata being different. So you would have the following semantics: 173 | 174 | * ``latin1_s = s.encode('latin-1')``: this yields a ``binary`` view and 175 | allocates new memory. 176 | * ``utf8_s = s.encode('utf-8')``: this is a no-op, but yields a ``binary`` view. 177 | * ``s2 = utf8_s.decode('utf-8')``: this requires running the data through a 178 | Unicode codec to validate it against the indicated encoding. 179 | 180 | Indexing and slicing 181 | ~~~~~~~~~~~~~~~~~~~~ 182 | 183 | Storing strings as UTF-8 bytes means that things like this become more 184 | complicated: 185 | 186 | .. ipython:: python 187 | 188 | s = pd.Series(["estás está estáis"]) 189 | s.str[9] 190 | s.str[6:10] 191 | 192 | Since UTF-8 is a variable-length encoding, finding the logical character at a 193 | given position will require making use of the Python C API (expensive, requires 194 | creating new Python objects) or a 3rd-party library. We could make use of the 195 | `ICU C++ Libraries `_ to implement this. 196 | --------------------------------------------------------------------------------