├── .gitignore
├── Power.ipynb
├── Randomization inference in networks.ipynb
├── Randomization inference.ipynb
├── Using covariates.ipynb
├── cai_data
├── cai.adjacency.RData
└── cai.main.tsv
├── prep_data.R
└── pseudo_facebook_small.tsv
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | # iPython
60 | .ipynb_checkpoints
61 |
62 | pseudo_facebook.tsv
63 |
--------------------------------------------------------------------------------
/Power.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Consequences of low power"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stderr",
19 | "output_type": "stream",
20 | "text": [
21 | "Loading required package: iterators\n",
22 | "Loading required package: parallel\n"
23 | ]
24 | }
25 | ],
26 | "source": [
27 | "library(foreach)\n",
28 | "library(doMC)\n",
29 | "registerDoMC(4)\n",
30 | "library(ggplot2)\n",
31 | "theme_set(theme_bw())\n",
32 | "library(repr)\n",
33 | "options(repr.plot.width=6, repr.plot.height=4)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {
40 | "collapsed": false
41 | },
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "\n",
47 | " Two-sample t test power calculation \n",
48 | "\n",
49 | " n = 50\n",
50 | " delta = 0.1\n",
51 | " sd = 1\n",
52 | " sig.level = 0.05\n",
53 | " power = 0.0715\n",
54 | " alternative = two.sided\n",
55 | "\n",
56 | "NOTE: n is number in *each* group\n"
57 | ]
58 | },
59 | "execution_count": 2,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "\n",
66 | "power.t.test(n = 50, delta = .1, sig.level = .05)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 3,
72 | "metadata": {
73 | "collapsed": false
74 | },
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/html": [
79 | "
\n",
80 | " | est | p |
\n",
81 | "\n",
82 | "\t1 | -0.000216 | 0.999 |
\n",
83 | "\n",
84 | "
\n"
85 | ],
86 | "text/latex": [
87 | "\\begin{tabular}{r|ll}\n",
88 | " & est & p\\\\\n",
89 | "\\hline\n",
90 | "\t1 & -0.000216 & 0.999\\\\\n",
91 | "\\end{tabular}\n"
92 | ],
93 | "text/plain": [
94 | " est p\n",
95 | "1 -0.000216 0.999"
96 | ]
97 | },
98 | "execution_count": 3,
99 | "metadata": {},
100 | "output_type": "execute_result"
101 | }
102 | ],
103 | "source": [
104 | "do.sim <- function(n, delta) {\n",
105 | " z <- rep(0:1, each = n)\n",
106 | " y <- z * delta + rnorm(n * 2)\n",
107 | " \n",
108 | " r <- t.test(y ~ z)\n",
109 | " data.frame(est = unname(r$estimate[2] - r$estimate[1]), p = r$p.value)\n",
110 | "}\n",
111 | "do.sim(50, .1)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 4,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "results <- foreach(i = 1:1e4, .combine = rbind) %dopar% do.sim(50, .1)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 5,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | " est p \n",
136 | " Min. :-0.694 Min. :0.000 \n",
137 | " 1st Qu.:-0.036 1st Qu.:0.199 \n",
138 | " Median : 0.100 Median :0.447 \n",
139 | " Mean : 0.099 Mean :0.461 \n",
140 | " 3rd Qu.: 0.231 3rd Qu.:0.711 \n",
141 | " Max. : 0.980 Max. :1.000 "
142 | ]
143 | },
144 | "execution_count": 5,
145 | "metadata": {},
146 | "output_type": "execute_result"
147 | }
148 | ],
149 | "source": [
150 | "summary(results)"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 6,
156 | "metadata": {
157 | "collapsed": false
158 | },
159 | "outputs": [
160 | {
161 | "name": "stderr",
162 | "output_type": "stream",
163 | "text": [
164 | "stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.\n"
165 | ]
166 | },
167 | {
168 | "data": {
169 | "image/png": "",
170 | "image/svg+xml": [
171 | "\n",
172 | "\n"
532 | ],
533 | "text/plain": [
534 | "plot without title"
535 | ]
536 | },
537 | "metadata": {
538 | "image/svg+xml": {
539 | "isolated": true
540 | }
541 | },
542 | "output_type": "display_data"
543 | }
544 | ],
545 | "source": [
546 | "ggplot(\n",
547 | " aes(x = est, fill = p < 0.05),\n",
548 | " data = results\n",
549 | ") +\n",
550 | "geom_histogram()"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "## Overestimating effects"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": 7,
563 | "metadata": {
564 | "collapsed": false
565 | },
566 | "outputs": [
567 | {
568 | "data": {
569 | "text/plain": [
570 | " Min. 1st Qu. Median Mean 3rd Qu. Max. \n",
571 | " -0.694 -0.036 0.100 0.099 0.231 0.980 "
572 | ]
573 | },
574 | "execution_count": 7,
575 | "metadata": {},
576 | "output_type": "execute_result"
577 | }
578 | ],
579 | "source": [
580 | "summary(results$est)"
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": 8,
586 | "metadata": {
587 | "collapsed": false
588 | },
589 | "outputs": [
590 | {
591 | "data": {
592 | "text/plain": [
593 | " Min. 1st Qu. Median Mean 3rd Qu. Max. \n",
594 | " -0.694 0.407 0.452 0.395 0.514 0.980 "
595 | ]
596 | },
597 | "execution_count": 8,
598 | "metadata": {},
599 | "output_type": "execute_result"
600 | }
601 | ],
602 | "source": [
603 | "summary(results$est[results$p < 0.05])"
604 | ]
605 | },
606 | {
607 | "cell_type": "markdown",
608 | "metadata": {},
609 | "source": [
610 | "Gelman and Carlin call the \"exaggeration factor: the expected (absolute) value of the estimate divided by the effect size, if it is statistically significantly different from zero.\""
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": 9,
616 | "metadata": {
617 | "collapsed": false
618 | },
619 | "outputs": [
620 | {
621 | "data": {
622 | "text/html": [
623 | "0.476724715735216"
624 | ],
625 | "text/latex": [
626 | "0.476724715735216"
627 | ],
628 | "text/markdown": [
629 | "0.476724715735216"
630 | ],
631 | "text/plain": [
632 | "[1] 0.477"
633 | ]
634 | },
635 | "execution_count": 9,
636 | "metadata": {},
637 | "output_type": "execute_result"
638 | }
639 | ],
640 | "source": [
641 | "mean(abs(results$est[results$p < 0.05]))"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 10,
647 | "metadata": {
648 | "collapsed": false
649 | },
650 | "outputs": [
651 | {
652 | "data": {
653 | "text/html": [
654 | "4.76724715735216"
655 | ],
656 | "text/latex": [
657 | "4.76724715735216"
658 | ],
659 | "text/markdown": [
660 | "4.76724715735216"
661 | ],
662 | "text/plain": [
663 | "[1] 4.77"
664 | ]
665 | },
666 | "execution_count": 10,
667 | "metadata": {},
668 | "output_type": "execute_result"
669 | }
670 | ],
671 | "source": [
672 | "mean(abs(results$est[results$p < 0.05])) / .1"
673 | ]
674 | },
675 | {
676 | "cell_type": "markdown",
677 | "metadata": {},
678 | "source": [
679 | "That is, we get estimates that are almost 5 times too large.\n",
680 | "\n",
681 | "We also get estimates with the wrong sign."
682 | ]
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": 11,
687 | "metadata": {
688 | "collapsed": false
689 | },
690 | "outputs": [
691 | {
692 | "data": {
693 | "text/plain": [
694 | "\n",
695 | " -1 1 \n",
696 | " 75 740 "
697 | ]
698 | },
699 | "execution_count": 11,
700 | "metadata": {},
701 | "output_type": "execute_result"
702 | }
703 | ],
704 | "source": [
705 | "table(sign(results$est[results$p < 0.05]))"
706 | ]
707 | },
708 | {
709 | "cell_type": "code",
710 | "execution_count": 12,
711 | "metadata": {
712 | "collapsed": false
713 | },
714 | "outputs": [
715 | {
716 | "data": {
717 | "text/html": [
718 | "0.0920245398773006"
719 | ],
720 | "text/latex": [
721 | "0.0920245398773006"
722 | ],
723 | "text/markdown": [
724 | "0.0920245398773006"
725 | ],
726 | "text/plain": [
727 | "[1] 0.092"
728 | ]
729 | },
730 | "execution_count": 12,
731 | "metadata": {},
732 | "output_type": "execute_result"
733 | }
734 | ],
735 | "source": [
736 | "mean(results$est[results$p < 0.05] < 0)"
737 | ]
738 | },
739 | {
740 | "cell_type": "markdown",
741 | "metadata": {},
742 | "source": [
743 | "This isn't the worst possible case. Many experiments have even lower power.\n",
744 | "\n",
745 | "### In combination with bad stopping rules\n",
746 | "\n",
747 | "Low power also makes the use of stopping rules more problematic.\n",
748 | "\n",
749 | "Here we do a simulation where we start with some n in each of treatment and control. Then check whether we have a significant result. If not, add some incremental n to each condition. Repeat until there is a significant result or we hit a maximum n."
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 13,
755 | "metadata": {
756 | "collapsed": true
757 | },
758 | "outputs": [],
759 | "source": [
760 | "add.data <- function(n, d, z = c(), y = c()) {\n",
761 | " z.new <- rep(0:1, each = n)\n",
762 | " y.new <- z.new * d + rnorm(n * 2)\n",
763 | " z <- c(z, z.new)\n",
764 | " y <- c(y, y.new)\n",
765 | " t <- t.test(y ~ z)\n",
766 | " list(z = z, y = y, t = t)\n",
767 | "}\n",
768 | "\n",
769 | "do.stopping.rule.sim <- function(n, d, n.inc, n.max) {\n",
770 | " r <- list(z = c(), y = c())\n",
771 | " while (TRUE) {\n",
772 | " nc <- length(r$z) / 2\n",
773 | " if (nc + n.inc >= n.max)\n",
774 | " break;\n",
775 | " ni <- if(nc == 0) n else n.inc\n",
776 | " r <- add.data(ni, d, r$z, r$y)\n",
777 | " if (r$t$p.value < 0.05)\n",
778 | " break;\n",
779 | " }\n",
780 | " r\n",
781 | "}"
782 | ]
783 | },
784 | {
785 | "cell_type": "code",
786 | "execution_count": 14,
787 | "metadata": {
788 | "collapsed": false
789 | },
790 | "outputs": [
791 | {
792 | "data": {
793 | "text/plain": [
794 | "\n",
795 | "\tWelch Two Sample t-test\n",
796 | "\n",
797 | "data: y by z\n",
798 | "t = -0.7, df = 200, p-value = 0.5\n",
799 | "alternative hypothesis: true difference in means is not equal to 0\n",
800 | "95 percent confidence interval:\n",
801 | " -0.361 0.172\n",
802 | "sample estimates:\n",
803 | "mean in group 0 mean in group 1 \n",
804 | " -0.0166 0.0780 \n"
805 | ]
806 | },
807 | "execution_count": 14,
808 | "metadata": {},
809 | "output_type": "execute_result"
810 | }
811 | ],
812 | "source": [
813 | "# one run, with no true effect\n",
814 | "do.stopping.rule.sim(n = 20, d = 0, n.inc = 2, n.max = 120)$t"
815 | ]
816 | },
817 | {
818 | "cell_type": "markdown",
819 | "metadata": {},
820 | "source": [
821 | "Run many simulations of experiments run with this stopping rule:"
822 | ]
823 | },
824 | {
825 | "cell_type": "code",
826 | "execution_count": 18,
827 | "metadata": {
828 | "collapsed": true
829 | },
830 | "outputs": [],
831 | "source": [
832 | "set.seed(8001)\n",
833 | "sr <- foreach(i = 1:1e3, .combine = rbind) %dopar% {\n",
834 | " r <- do.stopping.rule.sim(20, 0, 5, 50)\n",
835 | " data.frame(est = r$t$estimate[2] - r$t$estimate[1],\n",
836 | " p = r$t$p.value,\n",
837 | " n = length(r$z)\n",
838 | " )\n",
839 | " }"
840 | ]
841 | },
842 | {
843 | "cell_type": "code",
844 | "execution_count": 19,
845 | "metadata": {
846 | "collapsed": false
847 | },
848 | "outputs": [
849 | {
850 | "data": {
851 | "text/plain": [
852 | " est p n \n",
853 | " Min. :-1.073 Min. :0.000 Min. :40.0 \n",
854 | " 1st Qu.:-0.154 1st Qu.:0.216 1st Qu.:90.0 \n",
855 | " Median :-0.022 Median :0.506 Median :90.0 \n",
856 | " Mean :-0.015 Mean :0.486 Mean :85.7 \n",
857 | " 3rd Qu.: 0.128 3rd Qu.:0.751 3rd Qu.:90.0 \n",
858 | " Max. : 1.003 Max. :0.998 Max. :90.0 "
859 | ]
860 | },
861 | "execution_count": 19,
862 | "metadata": {},
863 | "output_type": "execute_result"
864 | }
865 | ],
866 | "source": [
867 | "summary(sr)"
868 | ]
869 | },
870 | {
871 | "cell_type": "code",
872 | "execution_count": 20,
873 | "metadata": {
874 | "collapsed": false
875 | },
876 | "outputs": [
877 | {
878 | "data": {
879 | "text/html": [
880 | "0.13"
881 | ],
882 | "text/latex": [
883 | "0.13"
884 | ],
885 | "text/markdown": [
886 | "0.13"
887 | ],
888 | "text/plain": [
889 | "[1] 0.13"
890 | ]
891 | },
892 | "execution_count": 20,
893 | "metadata": {},
894 | "output_type": "execute_result"
895 | }
896 | ],
897 | "source": [
898 | "mean(sr$p < 0.05)"
899 | ]
900 | },
901 | {
902 | "cell_type": "markdown",
903 | "metadata": {},
904 | "source": [
905 | "That is, rather than Type I error rate of 5%, we have 13%."
906 | ]
907 | },
908 | {
909 | "cell_type": "code",
910 | "execution_count": null,
911 | "metadata": {
912 | "collapsed": true
913 | },
914 | "outputs": [],
915 | "source": []
916 | }
917 | ],
918 | "metadata": {
919 | "kernelspec": {
920 | "display_name": "R",
921 | "language": "R",
922 | "name": "ir"
923 | },
924 | "language_info": {
925 | "codemirror_mode": "r",
926 | "file_extension": ".r",
927 | "mimetype": "text/x-r-source",
928 | "name": "R",
929 | "pygments_lexer": "r",
930 | "version": "3.2.2"
931 | }
932 | },
933 | "nbformat": 4,
934 | "nbformat_minor": 0
935 | }
936 |
--------------------------------------------------------------------------------
/Randomization inference in networks.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Randomization inference for spillovers in networks\n",
8 | "\n",
9 | "This uses data from:\n",
10 | "Cai, Jing, Alain De Janvry, and Elisabeth Sadoulet. 2015. \"Social Networks and the Decision to Insure.\" American Economic Journal: Applied Economics, 7(2): 81-108.\n",
11 | "https://www.aeaweb.org/articles.php?doi=10.1257/app.20130442\n",
12 | "\n",
13 | "That paper examines spillover effects in rural Chinese farmers being encouraged to sign up for insurance. Households were randomly assigned to different periods in which to be encouraged to get insurance and whether that encouragement was 'intensive'.\n",
14 | "\n",
15 | "\"The social network survey asked household heads to list five close friends, either within or outside the village, with whom they most frequently discuss rice production or financial issues. Respondents were asked to rank these friends based on which one would be consulted first, second, etc.\"\n",
16 | "\n",
17 | "We are essentially re-doing Table 2 column 2 (there are some minor differences because of how we have simplified things a bit).\n"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 8,
23 | "metadata": {
24 | "collapsed": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "options(digits = 3)\n",
29 | "library(ggplot2)\n",
30 | "theme_set(theme_bw())\n",
31 | "options(repr.plot.width = 6)\n",
32 | "options(repr.plot.height = 4)\n",
33 | "\n",
34 | "library(icsw)\n",
35 | "library(foreach)\n",
36 | "library(Matrix)\n",
37 | "library(lfe)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 9,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [],
47 | "source": [
48 | "cai <- read.table(\"cai_data/cai.main.tsv\", sep = \"\\t\")"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 10,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "data": {
60 | "text/html": [
61 | "\n",
62 | " | id | address | region | village | takeup_survey | age | male | delay | intensive | info_none | intensive.nondelay.peers | n.peers |
\n",
63 | "\n",
64 | "\t1 | 1111385 | fusheng67 | 1 | fusheng | 1 | 37 | 1 | 0 | 1 | 1 | 0 | 4 |
\n",
65 | "\t2 | 1111035 | fusheng21 | 1 | fusheng | 1 | 60 | 1 | 0 | 1 | 1 | 0 | 0 |
\n",
66 | "\t3 | 1111363 | fusheng5 | 1 | fusheng | 0 | 56 | 1 | 0 | 0 | 1 | 0 | 3 |
\n",
67 | "\t4 | 1111042 | fusheng21 | 1 | fusheng | 1 | 57 | 1 | 0 | 0 | 1 | 1 | 1 |
\n",
68 | "\t5 | 1111045 | fusheng21 | 1 | fusheng | 1 | 45 | 1 | 1 | 0 | 1 | 2 | 4 |
\n",
69 | "\t6 | 1111038 | fusheng21 | 1 | fusheng | 1 | 61 | 1 | 1 | 1 | 1 | 0 | 4 |
\n",
70 | "\n",
71 | "
\n"
72 | ],
73 | "text/latex": [
74 | "\\begin{tabular}{r|llllllllllll}\n",
75 | " & id & address & region & village & takeup_survey & age & male & delay & intensive & info_none & intensive.nondelay.peers & n.peers\\\\\n",
76 | "\\hline\n",
77 | "\t1 & 1111385 & fusheng67 & 1 & fusheng & 1 & 37 & 1 & 0 & 1 & 1 & 0 & 4\\\\\n",
78 | "\t2 & 1111035 & fusheng21 & 1 & fusheng & 1 & 60 & 1 & 0 & 1 & 1 & 0 & 0\\\\\n",
79 | "\t3 & 1111363 & fusheng5 & 1 & fusheng & 0 & 56 & 1 & 0 & 0 & 1 & 0 & 3\\\\\n",
80 | "\t4 & 1111042 & fusheng21 & 1 & fusheng & 1 & 57 & 1 & 0 & 0 & 1 & 1 & 1\\\\\n",
81 | "\t5 & 1111045 & fusheng21 & 1 & fusheng & 1 & 45 & 1 & 1 & 0 & 1 & 2 & 4\\\\\n",
82 | "\t6 & 1111038 & fusheng21 & 1 & fusheng & 1 & 61 & 1 & 1 & 1 & 1 & 0 & 4\\\\\n",
83 | "\\end{tabular}\n"
84 | ],
85 | "text/plain": [
86 | " id address region village takeup_survey age male delay intensive info_none intensive.nondelay.peers n.peers\n",
87 | "1 1111385 fusheng67 1 fusheng 1 37 1 0 1 1 0 4\n",
88 | "2 1111035 fusheng21 1 fusheng 1 60 1 0 1 1 0 0\n",
89 | "3 1111363 fusheng5 1 fusheng 0 56 1 0 0 1 0 3\n",
90 | "4 1111042 fusheng21 1 fusheng 1 57 1 0 0 1 1 1\n",
91 | "5 1111045 fusheng21 1 fusheng 1 45 1 1 0 1 2 4\n",
92 | "6 1111038 fusheng21 1 fusheng 1 61 1 1 1 1 0 4"
93 | ]
94 | },
95 | "execution_count": 10,
96 | "metadata": {},
97 | "output_type": "execute_result"
98 | }
99 | ],
100 | "source": [
101 | "head(cai)"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "The main outcome is whether they sign for insurance `takeup_survey`."
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 11,
114 | "metadata": {
115 | "collapsed": false,
116 | "scrolled": true
117 | },
118 | "outputs": [
119 | {
120 | "name": "stderr",
121 | "output_type": "stream",
122 | "text": [
123 | " [[ suppressing 10 column names ‘1111385’, ‘1111035’, ‘1111363’ ... ]]\n",
124 | " [[ suppressing 10 column names ‘1111385’, ‘1111035’, ‘1111363’ ... ]]\n"
125 | ]
126 | },
127 | {
128 | "data": {
129 | "text/plain": [
130 | "10 x 10 sparse Matrix of class \"dgCMatrix\"\n",
131 | " \n",
132 | "1111385 . . . . . . . . . .\n",
133 | "1111035 . . . . . . . . . .\n",
134 | "1111363 . . . . . . . . . .\n",
135 | "1111042 . . . . . . . . . .\n",
136 | "1111045 . . . . . . . 1 . .\n",
137 | "1111038 . . . . . . . 1 1 .\n",
138 | "1111034 . . . . . . . . . 1\n",
139 | "1111055 . . . . 1 . . . . .\n",
140 | "1111050 . . . . 1 1 . 1 . 1\n",
141 | "1111031 . . . . 1 . . 1 . ."
142 | ]
143 | },
144 | "execution_count": 11,
145 | "metadata": {},
146 | "output_type": "execute_result"
147 | }
148 | ],
149 | "source": [
150 | "load(\"cai_data/cai.adjacency.RData\")\n",
151 | "A[1:10,1:10]"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "Now let's estimate the relationship between how many peers were given the strong encouragement to sign up for insurance in the prior period, `intensive.nondelay.peers` and the outcome `takeup_survey`. We will only do this for egos who didn't get the treatment in the prior period and who didn't receive information, as part of their treatment, about the adoption rates in their area. (This is also what the paper does.)"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "Now we can get a point estimate for the effects of peer treatments:"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 12,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [
175 | {
176 | "data": {
177 | "text/plain": [
178 | "\n",
179 | "Call:\n",
180 | "lm(formula = takeup_survey ~ intensive + I(intensive.nondelay.peers/n.peers) + \n",
181 | " factor(n.peers), data = cai)\n",
182 | "\n",
183 | "Residuals:\n",
184 | " Min 1Q Median 3Q Max \n",
185 | "-0.576 -0.451 -0.373 0.533 0.688 \n",
186 | "\n",
187 | "Coefficients:\n",
188 | " Estimate Std. Error t value Pr(>|t|) \n",
189 | "(Intercept) 0.3116 0.0340 9.16 < 2e-16 ***\n",
190 | "intensive 0.0779 0.0148 5.26 1.5e-07 ***\n",
191 | "I(intensive.nondelay.peers/n.peers) 0.0508 0.0295 1.72 0.08469 . \n",
192 | "factor(n.peers)2 0.0563 0.0389 1.45 0.14796 \n",
193 | "factor(n.peers)3 0.0610 0.0361 1.69 0.09071 . \n",
194 | "factor(n.peers)4 0.0833 0.0350 2.38 0.01746 * \n",
195 | "factor(n.peers)5 0.1356 0.0354 3.83 0.00013 ***\n",
196 | "---\n",
197 | "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
198 | "\n",
199 | "Residual standard error: 0.494 on 4514 degrees of freedom\n",
200 | " (256 observations deleted due to missingness)\n",
201 | "Multiple R-squared: 0.0124,\tAdjusted R-squared: 0.0111 \n",
202 | "F-statistic: 9.42 on 6 and 4514 DF, p-value: 2.62e-10\n"
203 | ]
204 | },
205 | "execution_count": 12,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | }
209 | ],
210 | "source": [
211 | "lm.1 <- lm(\n",
212 | " takeup_survey ~ intensive + I(intensive.nondelay.peers/n.peers) + factor(n.peers),\n",
213 | " data = cai\n",
214 | ")\n",
215 | "summary(lm.1)"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "There could be something here. We see both an effect of one's own treatment, and an effect of the fraction of peers treated in the early period.\n",
223 | "\n",
224 | "Note that depending on your number of peers, the fraction of treated peers can only take on some values. This violates the 'positivity' support assumption for causal inference. Even if this worked with the number of treated peers, and considered only 0 or more than 0 treated peers, the propensity scores would be heterogeneous. The author attempt to deal with this by adding the indicators for each number of friends.\n",
225 | "\n",
226 | "Make a function that, given the data (or permuted data) computes our test statistic -- the regression coefficient from above. We can see that it gives the same results as before:"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 13,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [
236 | {
237 | "name": "stdout",
238 | "output_type": "stream",
239 | "text": [
240 | "I(z.peers/n.peers) \n",
241 | " 0.0508 \n"
242 | ]
243 | }
244 | ],
245 | "source": [
246 | "peer.regression.coef <- function(z, z.peers, n.peers, y) {\n",
247 | " coef(lm(y ~ z + I(z.peers / n.peers) + factor(n.peers)))[3]\n",
248 | "}\n",
249 | "\n",
250 | "obs.coef <- with(\n",
251 | " cai,\n",
252 | " peer.regression.coef(intensive, intensive.nondelay.peers, n.peers, takeup_survey)\n",
253 | ")\n",
254 | "print(obs.coef)"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "Now we write a function to do the focal-auxillary permuation and compute the test statistic for each permutation."
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 14,
267 | "metadata": {
268 | "collapsed": true
269 | },
270 | "outputs": [],
271 | "source": [
272 | "do.focal.aux.permutation <- function(adj.mat, z, n.peers, y,\n",
273 | " is.focal, R = 1e3,\n",
274 | " fnc = peer.regression.coef) {\n",
275 | " foreach(i = 1:R, .combine = 'c') %do% {\n",
276 | " zp <- z\n",
277 | " zp[!is.focal] <- sample(z[!is.focal]) # permute treatments for auxillary vertices only\n",
278 | " zp.peers <- as.vector(adj.mat %*% zp) # re-compute number of peers treated\n",
279 | " fnc(z[is.focal], zp.peers[is.focal], n.peers[is.focal], y[is.focal])\n",
280 | " }\n",
281 | "}\n"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "We can now call this function to draw from the distribution of the test statistic under the null of no spillovers (but possible direct effects).\n",
289 | "\n",
290 | "Just for illustration, let's start by just selecting a random 2000 units as focal units."
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "Actually in the paper, they mainly don't focus on contemporaneous influence. Rather the authors look for effects of the assignment of peers treated in period 1 on egos only assigned in round 2."
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 15,
303 | "metadata": {
304 | "collapsed": false
305 | },
306 | "outputs": [
307 | {
308 | "data": {
309 | "text/plain": [
310 | " Min. 1st Qu. Median Mean 3rd Qu. Max. \n",
311 | "-0.0371 0.0434 0.0661 0.0650 0.0883 0.1760 "
312 | ]
313 | },
314 | "execution_count": 15,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "\n",
321 | "cai$is.focal <- sample(c(rep(TRUE, 2000), rep(FALSE, nrow(cai) - 2000)))\n",
322 | "\n",
323 | "null.coefs <- do.focal.aux.permutation(\n",
324 | " A,\n",
325 | " cai$intensive,\n",
326 | " cai$n.peers, cai$takeup_survey,\n",
327 | " cai$is.focal,\n",
328 | " R = 1e3\n",
329 | ")\n",
330 | "\n",
331 | "summary(null.coefs)"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 16,
337 | "metadata": {
338 | "collapsed": false
339 | },
340 | "outputs": [
341 | {
342 | "name": "stdout",
343 | "output_type": "stream",
344 | "text": [
345 | "I(z.peers/n.peers) \n",
346 | " 0.0449 \n"
347 | ]
348 | }
349 | ],
350 | "source": [
351 | "obs.coef <- with(\n",
352 | " subset(cai, is.focal),\n",
353 | " peer.regression.coef(intensive, intensive.nondelay.peers, n.peers, takeup_survey)\n",
354 | ")\n",
355 | "print(obs.coef)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 17,
361 | "metadata": {
362 | "collapsed": false
363 | },
364 | "outputs": [
365 | {
366 | "data": {
367 | "text/html": [
368 | "0.526"
369 | ],
370 | "text/latex": [
371 | "0.526"
372 | ],
373 | "text/markdown": [
374 | "0.526"
375 | ],
376 | "text/plain": [
377 | "[1] 0.526"
378 | ]
379 | },
380 | "execution_count": 17,
381 | "metadata": {},
382 | "output_type": "execute_result"
383 | }
384 | ],
385 | "source": [
386 | "two.sided.p.value.perm <- function(obs, null.draws) {\n",
387 | " lower.p <- mean(obs > null.draws)\n",
388 | " upper.p <- mean(obs < null.draws)\n",
389 | " 2 * min(lower.p, upper.p)\n",
390 | "}\n",
391 | "\n",
392 | "two.sided.p.value.perm(obs.coef, null.coefs)"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {},
398 | "source": [
399 | "So actually this doesn't look statistically significant...\n",
400 | "\n",
401 | "But, in fact, in the paper they focus their analysis a bit more in a couple ways. They can restrict attention to households treated in the second period.\n",
402 | "\n",
403 | "Another way is to restrict attention to households that were not, as part of the treatment, randomly assigned to get information about insurance adoption in their area. Perhaps that social information would reduce the impact of other social info. These are identified with `info_none == 1` if you want to try that."
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 18,
409 | "metadata": {
410 | "collapsed": false
411 | },
412 | "outputs": [
413 | {
414 | "name": "stdout",
415 | "output_type": "stream",
416 | "text": [
417 | "I(z.peers/n.peers) \n",
418 | " 0.0508 \n"
419 | ]
420 | }
421 | ],
422 | "source": [
423 | "# a relevant subset\n",
424 | "cai$relevant.case <- with(cai, delay == 1 )\n",
425 | "\n",
426 | "obs.coef <- with(\n",
427 | " cai,\n",
428 | " peer.regression.coef(intensive, intensive.nondelay.peers, n.peers, takeup_survey)\n",
429 | ")\n",
430 | "print(obs.coef)"
431 | ]
432 | },
433 | {
434 | "cell_type": "markdown",
435 | "metadata": {},
436 | "source": [
437 | "Since only some units have relevant outcomes now, we can make all of them the focal units."
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 19,
443 | "metadata": {
444 | "collapsed": false
445 | },
446 | "outputs": [
447 | {
448 | "data": {
449 | "text/plain": [
450 | " Min. 1st Qu. Median Mean 3rd Qu. Max. \n",
451 | "-0.0155 0.0467 0.0632 0.0630 0.0787 0.1340 "
452 | ]
453 | },
454 | "execution_count": 19,
455 | "metadata": {},
456 | "output_type": "execute_result"
457 | }
458 | ],
459 | "source": [
460 | "cai$is.focal <- cai$relevant.case\n",
461 | "\n",
462 | "null.coefs <- do.focal.aux.permutation(\n",
463 | " A,\n",
464 | " cai$intensive,\n",
465 | " cai$n.peers, cai$takeup_survey,\n",
466 | " cai$is.focal,\n",
467 | " R = 1e3\n",
468 | ")\n",
469 | "\n",
470 | "summary(null.coefs)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 20,
476 | "metadata": {
477 | "collapsed": false
478 | },
479 | "outputs": [
480 | {
481 | "name": "stdout",
482 | "output_type": "stream",
483 | "text": [
484 | "I(z.peers/n.peers) \n",
485 | " 0.169 \n"
486 | ]
487 | }
488 | ],
489 | "source": [
490 | "obs.coef <- with(\n",
491 | " subset(cai, is.focal),\n",
492 | " peer.regression.coef(intensive, intensive.nondelay.peers, n.peers, takeup_survey)\n",
493 | ")\n",
494 | "print(obs.coef)"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 21,
500 | "metadata": {
501 | "collapsed": false
502 | },
503 | "outputs": [
504 | {
505 | "data": {
506 | "text/html": [
507 | "0"
508 | ],
509 | "text/latex": [
510 | "0"
511 | ],
512 | "text/markdown": [
513 | "0"
514 | ],
515 | "text/plain": [
516 | "[1] 0"
517 | ]
518 | },
519 | "execution_count": 21,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": [
525 | "two.sided.p.value.perm(obs.coef, null.coefs)"
526 | ]
527 | },
528 | {
529 | "cell_type": "markdown",
530 | "metadata": {},
531 | "source": [
532 | "So the p-value is very close to 0. For this subpopulation, there is strong evidence of a spillover effect."
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": null,
538 | "metadata": {
539 | "collapsed": true
540 | },
541 | "outputs": [],
542 | "source": []
543 | }
544 | ],
545 | "metadata": {
546 | "kernelspec": {
547 | "display_name": "R",
548 | "language": "R",
549 | "name": "ir"
550 | },
551 | "language_info": {
552 | "codemirror_mode": "r",
553 | "file_extension": ".r",
554 | "mimetype": "text/x-r-source",
555 | "name": "R",
556 | "pygments_lexer": "r",
557 | "version": "3.2.2"
558 | }
559 | },
560 | "nbformat": 4,
561 | "nbformat_minor": 0
562 | }
563 |
--------------------------------------------------------------------------------
/Using covariates.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Using covariates to increase precision"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 67,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [],
17 | "source": [
18 | "options(digits = 2)\n",
19 | "library(foreach)\n",
20 | "library(doMC)\n",
21 | "registerDoMC(cores = 4)\n",
22 | "library(ggplot2)\n",
23 | "theme_set(theme_bw())\n",
24 | "library(repr)\n",
25 | "options(repr.plot.width=6, repr.plot.height=4)"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "Synthetic Facebook data set from https://www.udacity.com/wiki/ud651#!#data-sets.\n",
33 | "We are working with a subset of this to make the simulations fast."
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 42,
39 | "metadata": {
40 | "collapsed": false
41 | },
42 | "outputs": [],
43 | "source": [
44 | "ps <- read.table(\"pseudo_facebook_small.tsv\", header = TRUE)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 72,
50 | "metadata": {
51 | "collapsed": false
52 | },
53 | "outputs": [
54 | {
55 | "data": {
56 | "text/html": [
57 | "\n",
58 | "\t- 10000
\n",
59 | "\t- 15
\n",
60 | "
\n"
61 | ],
62 | "text/latex": [
63 | "\\begin{enumerate*}\n",
64 | "\\item 10000\n",
65 | "\\item 15\n",
66 | "\\end{enumerate*}\n"
67 | ],
68 | "text/markdown": [
69 | "1. 10000\n",
70 | "2. 15\n",
71 | "\n",
72 | "\n"
73 | ],
74 | "text/plain": [
75 | "[1] 10000 15"
76 | ]
77 | },
78 | "execution_count": 72,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | },
82 | {
83 | "data": {
84 | "text/plain": [
85 | " Min. 1st Qu. Median Mean 3rd Qu. Max. \n",
86 | " 0 1 10 149 76 13600 "
87 | ]
88 | },
89 | "execution_count": 72,
90 | "metadata": {},
91 | "output_type": "execute_result"
92 | },
93 | {
94 | "data": {
95 | "text/plain": [
96 | " Min. 1st Qu. Median Mean 3rd Qu. Max. \n",
97 | " 0 229 411 539 672 2820 "
98 | ]
99 | },
100 | "execution_count": 72,
101 | "metadata": {},
102 | "output_type": "execute_result"
103 | }
104 | ],
105 | "source": [
106 | "dim(ps)\n",
107 | "summary(ps$likes)\n",
108 | "summary(ps$tenure)"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "# Simulate estimates under the sharp null\n",
116 | "Simulation that computes simple difference in means and regression adjustment estimators. We use two regression adjustments: one with a single linear term and one with an interaction term with the covariate centered. See Lin (2013) in Annals of Applied Statistics."
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 73,
122 | "metadata": {
123 | "collapsed": false
124 | },
125 | "outputs": [],
126 | "source": [
127 | "do.sim <- function(x, y, nt = round(n / 2)) {\n",
128 | " n <- length(x)\n",
129 | " \n",
130 | " z.c <- rep(0, n)\n",
131 | " z.c[sample.int(n, nt)] <- 1\n",
132 | " \n",
133 | " tau.sd <- unname(coef(lm(y ~ z.c))[2]) # unadjusted\n",
134 | " tau.adj <- unname(coef(lm(y ~ z.c + x))[2]) # regression adjustment\n",
135 | " \n",
136 | " # interaction w centered covariate:\n",
137 | " x0 <- x - mean(x)\n",
138 | " tau.adj.int <- unname(coef(lm(y ~ z.c * x0))[2])\n",
139 | " \n",
140 | " c(th.sd = tau.sd, th.adj = tau.adj, th.adj.int = tau.adj.int)\n",
141 | "}"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "Do a single simulation:"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 66,
154 | "metadata": {
155 | "collapsed": false,
156 | "scrolled": true
157 | },
158 | "outputs": [
159 | {
160 | "data": {
161 | "text/html": [
162 | "\n",
163 | "\t- th.sd
\n",
164 | "\t\t- 0.0513337038631001
\n",
165 | "\t- th.adj
\n",
166 | "\t\t- 0.0596912108653627
\n",
167 | "\t- th.adj.int
\n",
168 | "\t\t- 0.058737335222844
\n",
169 | "
\n"
170 | ],
171 | "text/latex": [
172 | "\\begin{description*}\n",
173 | "\\item[th.sd] 0.0513337038631001\n",
174 | "\\item[th.adj] 0.0596912108653627\n",
175 | "\\item[th.adj.int] 0.058737335222844\n",
176 | "\\end{description*}\n"
177 | ],
178 | "text/markdown": [
179 | "th.sd\n",
180 | ": 0.0513337038631001th.adj\n",
181 | ": 0.0596912108653627th.adj.int\n",
182 | ": 0.058737335222844\n",
183 | "\n"
184 | ],
185 | "text/plain": [
186 | " th.sd th.adj th.adj.int \n",
187 | " 0.051 0.060 0.059 "
188 | ]
189 | },
190 | "execution_count": 66,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "with(ps, do.sim(tenure, log1p(likes), nt = 2000))"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "The estimates, as we should expect, are quite similar. But we really want to compare their variance and error across many possible randomizations.\n",
204 | "\n",
205 | "Do many simulations:"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 59,
211 | "metadata": {
212 | "collapsed": false
213 | },
214 | "outputs": [],
215 | "source": [
216 | "sr <- foreach(i = 1:1e3, .combine = rbind) %dopar%\n",
217 | " with(ps, do.sim(tenure, log1p(likes), nt = 2000))"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 60,
223 | "metadata": {
224 | "collapsed": false
225 | },
226 | "outputs": [
227 | {
228 | "data": {
229 | "text/plain": [
230 | " th.sd th.adj th.adj.int \n",
231 | " Min. :-0.182 Min. :-0.182 Min. :-0.184 \n",
232 | " 1st Qu.:-0.039 1st Qu.:-0.038 1st Qu.:-0.038 \n",
233 | " Median : 0.000 Median : 0.000 Median : 0.000 \n",
234 | " Mean : 0.000 Mean : 0.000 Mean : 0.000 \n",
235 | " 3rd Qu.: 0.038 3rd Qu.: 0.036 3rd Qu.: 0.036 \n",
236 | " Max. : 0.191 Max. : 0.190 Max. : 0.190 "
237 | ]
238 | },
239 | "execution_count": 60,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "summary(sr)"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 61,
251 | "metadata": {
252 | "collapsed": false
253 | },
254 | "outputs": [
255 | {
256 | "data": {
257 | "text/html": [
258 | "\n",
259 | "\t- th.sd
\n",
260 | "\t\t- 0.0580209274316362
\n",
261 | "\t- th.adj
\n",
262 | "\t\t- 0.0573698971473993
\n",
263 | "\t- th.adj.int
\n",
264 | "\t\t- 0.0574078054448429
\n",
265 | "
\n"
266 | ],
267 | "text/latex": [
268 | "\\begin{description*}\n",
269 | "\\item[th.sd] 0.0580209274316362\n",
270 | "\\item[th.adj] 0.0573698971473993\n",
271 | "\\item[th.adj.int] 0.0574078054448429\n",
272 | "\\end{description*}\n"
273 | ],
274 | "text/markdown": [
275 | "th.sd\n",
276 | ": 0.0580209274316362th.adj\n",
277 | ": 0.0573698971473993th.adj.int\n",
278 | ": 0.0574078054448429\n",
279 | "\n"
280 | ],
281 | "text/plain": [
282 | " th.sd th.adj th.adj.int \n",
283 | " 0.058 0.057 0.057 "
284 | ]
285 | },
286 | "execution_count": 61,
287 | "metadata": {},
288 | "output_type": "execute_result"
289 | }
290 | ],
291 | "source": [
292 | "apply(sr, 2, sd)"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 62,
298 | "metadata": {
299 | "collapsed": false
300 | },
301 | "outputs": [
302 | {
303 | "data": {
304 | "text/html": [
305 | "\n",
306 | "\t- th.sd
\n",
307 | "\t\t- 0.000182805467572636
\n",
308 | "\t- th.adj
\n",
309 | "\t\t- 0.000250078723388885
\n",
310 | "\t- th.adj.int
\n",
311 | "\t\t- 0.000304896568073275
\n",
312 | "
\n"
313 | ],
314 | "text/latex": [
315 | "\\begin{description*}\n",
316 | "\\item[th.sd] 0.000182805467572636\n",
317 | "\\item[th.adj] 0.000250078723388885\n",
318 | "\\item[th.adj.int] 0.000304896568073275\n",
319 | "\\end{description*}\n"
320 | ],
321 | "text/markdown": [
322 | "th.sd\n",
323 | ": 0.000182805467572636th.adj\n",
324 | ": 0.000250078723388885th.adj.int\n",
325 | ": 0.000304896568073275\n",
326 | "\n"
327 | ],
328 | "text/plain": [
329 | " th.sd th.adj th.adj.int \n",
330 | " 0.00018 0.00025 0.00030 "
331 | ]
332 | },
333 | "execution_count": 62,
334 | "metadata": {},
335 | "output_type": "execute_result"
336 | }
337 | ],
338 | "source": [
339 | "apply(sr, 2, mean)"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 63,
345 | "metadata": {
346 | "collapsed": false
347 | },
348 | "outputs": [
349 | {
350 | "data": {
351 | "text/html": [
352 | "\n",
353 | "\t- th.sd
\n",
354 | "\t\t- 0.0579921978359688
\n",
355 | "\t- th.adj
\n",
356 | "\t\t- 0.0573417503480001
\n",
357 | "\t- th.adj.int
\n",
358 | "\t\t- 0.0573799044246692
\n",
359 | "
\n"
360 | ],
361 | "text/latex": [
362 | "\\begin{description*}\n",
363 | "\\item[th.sd] 0.0579921978359688\n",
364 | "\\item[th.adj] 0.0573417503480001\n",
365 | "\\item[th.adj.int] 0.0573799044246692\n",
366 | "\\end{description*}\n"
367 | ],
368 | "text/markdown": [
369 | "th.sd\n",
370 | ": 0.0579921978359688th.adj\n",
371 | ": 0.0573417503480001th.adj.int\n",
372 | ": 0.0573799044246692\n",
373 | "\n"
374 | ],
375 | "text/plain": [
376 | " th.sd th.adj th.adj.int \n",
377 | " 0.058 0.057 0.057 "
378 | ]
379 | },
380 | "execution_count": 63,
381 | "metadata": {},
382 | "output_type": "execute_result"
383 | }
384 | ],
385 | "source": [
386 | "rmses <- sqrt(apply(sr, 2, function(x) mean(x^2)))\n",
387 | "rmses"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "Reduction in root mean squared error for tau:"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 64,
400 | "metadata": {
401 | "collapsed": false
402 | },
403 | "outputs": [
404 | {
405 | "data": {
406 | "text/html": [
407 | "\n",
408 | "\t- th.sd
\n",
409 | "\t\t- 0
\n",
410 | "\t- th.adj
\n",
411 | "\t\t- 0.0112161206548594
\n",
412 | "\t- th.adj.int
\n",
413 | "\t\t- 0.0105582032436756
\n",
414 | "
\n"
415 | ],
416 | "text/latex": [
417 | "\\begin{description*}\n",
418 | "\\item[th.sd] 0\n",
419 | "\\item[th.adj] 0.0112161206548594\n",
420 | "\\item[th.adj.int] 0.0105582032436756\n",
421 | "\\end{description*}\n"
422 | ],
423 | "text/markdown": [
424 | "th.sd\n",
425 | ": 0th.adj\n",
426 | ": 0.0112161206548594th.adj.int\n",
427 | ": 0.0105582032436756\n",
428 | "\n"
429 | ],
430 | "text/plain": [
431 | " th.sd th.adj th.adj.int \n",
432 | " 0.000 0.011 0.011 "
433 | ]
434 | },
435 | "execution_count": 64,
436 | "metadata": {},
437 | "output_type": "execute_result"
438 | }
439 | ],
440 | "source": [
441 | "1 - rmses / rmses[1]"
442 | ]
443 | },
444 | {
445 | "cell_type": "markdown",
446 | "metadata": {},
447 | "source": [
448 | "This isn't much reduction. That doesn't mean there isn't clear evidence of tenure being related to likes; we can clearly detect a relationships, but mainly because we have a large N, not because there is a strong association:"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 65,
454 | "metadata": {
455 | "collapsed": false
456 | },
457 | "outputs": [
458 | {
459 | "data": {
460 | "text/plain": [
461 | "\n",
462 | "Call:\n",
463 | "lm(formula = log1p(likes) ~ tenure, data = ps)\n",
464 | "\n",
465 | "Residuals:\n",
466 | " Min 1Q Median 3Q Max \n",
467 | "-3.925 -2.120 -0.217 1.659 6.745 \n",
468 | "\n",
469 | "Coefficients:\n",
470 | " Estimate Std. Error t value Pr(>|t|) \n",
471 | "(Intercept) 2.233249 0.033869 65.9 <2e-16 ***\n",
472 | "tenure 0.000795 0.000048 16.6 <2e-16 ***\n",
473 | "---\n",
474 | "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
475 | "\n",
476 | "Residual standard error: 2.2 on 9998 degrees of freedom\n",
477 | "Multiple R-squared: 0.0268,\tAdjusted R-squared: 0.0267 \n",
478 | "F-statistic: 275 on 1 and 9998 DF, p-value: <2e-16\n"
479 | ]
480 | },
481 | "execution_count": 65,
482 | "metadata": {},
483 | "output_type": "execute_result"
484 | }
485 | ],
486 | "source": [
487 | "summary(lm(log1p(likes) ~ tenure, data = ps))"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": 68,
493 | "metadata": {
494 | "collapsed": false
495 | },
496 | "outputs": [
497 | {
498 | "name": "stderr",
499 | "output_type": "stream",
500 | "text": [
501 | "geom_smooth: method=\"auto\" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = \"cs\"). Use 'method = x' to change the smoothing method.\n"
502 | ]
503 | },
504 | {
505 | "data": {
506 | "image/png": "",
507 | "image/svg+xml": [
508 | "\n",
509 | "\n"
816 | ],
817 | "text/plain": [
818 | "plot without title"
819 | ]
820 | },
821 | "metadata": {
822 | "image/svg+xml": {
823 | "isolated": true
824 | }
825 | },
826 | "output_type": "display_data"
827 | }
828 | ],
829 | "source": [
830 | "ggplot(\n",
831 | " aes(x = log1p(tenure), y = log1p(likes)),\n",
832 | " data = ps\n",
833 | ") + \n",
834 | "geom_smooth()"
835 | ]
836 | },
837 | {
838 | "cell_type": "markdown",
839 | "metadata": {},
840 | "source": [
841 | "## Exercises\n",
842 | "Compare regression adjustment with post-stratification and blocking (ie pre-stratification).\n",
843 | "\n",
844 | "Conduct similar analysis of another data set."
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": null,
850 | "metadata": {
851 | "collapsed": true
852 | },
853 | "outputs": [],
854 | "source": []
855 | }
856 | ],
857 | "metadata": {
858 | "kernelspec": {
859 | "display_name": "R",
860 | "language": "R",
861 | "name": "ir"
862 | },
863 | "language_info": {
864 | "codemirror_mode": "r",
865 | "file_extension": ".r",
866 | "mimetype": "text/x-r-source",
867 | "name": "R",
868 | "pygments_lexer": "r",
869 | "version": "3.2.2"
870 | }
871 | },
872 | "nbformat": 4,
873 | "nbformat_minor": 0
874 | }
875 |
--------------------------------------------------------------------------------
/cai_data/cai.adjacency.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deaneckles/randomization_inference/97b851c079853b1d546acc68db7c5fd45cc82e3e/cai_data/cai.adjacency.RData
--------------------------------------------------------------------------------
/prep_data.R:
--------------------------------------------------------------------------------
1 | library(igraph)
2 | library(foreign)
3 | library(Matrix)
4 | library(lfe)
5 | library(dplyr)
6 | library(foreach)
7 |
8 | cain.all <- read.dta("cai_data/0422allinforawnet.dta")
9 | cais.all <- read.dta("cai_data/0422survey.dta")
10 |
11 | cai.all <- read.csv("cai_data/0422analysis.csv")
12 | cai.all$id <- as.character(cai.all$id)
13 |
14 | cain.all <- subset(cain.all, !is.na(network_id) & network_id != 99)
15 |
16 | cain <- cain.all
17 |
18 | cain.el <- apply(as.matrix(cain[, c("id", "network_id")]), 2, as.character)
19 | cain.el <- cain.el[!is.na(cain.el[, 2]), ]
20 |
21 | ids.in.net <- unique(c(cain.el[, 1], cain.el[, 2]))
22 | egos.in.net <- unique(cain.el[, 1])
23 | ids.in.survey <- unique(cai.all$id)
24 | ids.in.net.only <- ids.in.net[!ids.in.net %in% ids.in.survey]
25 | egos.in.net.only <- egos.in.net[!egos.in.net %in% ids.in.survey]
26 | ids.in.survey.only <- ids.in.survey[!ids.in.survey %in% ids.in.net]
27 |
28 | cai <- subset(cai.all, id %in% egos.in.net)
29 |
30 | g <- graph_from_edgelist(cain.el, directed = TRUE)
31 |
32 | cain.peer.summary <- cain[!duplicated(cain$network_id),] %>%
33 | select(
34 | id = network_id,
35 | village = network_village,
36 | address = network_address,
37 | takeup_survey, delay, intensive, understanding
38 | ) %>%
39 | mutate(id = as.character(id)) %>%
40 | filter(!is.na(intensive))
41 |
42 | peers.in.net <- unique(cain.peer.summary$id)
43 | ids.in.survey <- unique(cai.all$id)
44 | peers.in.net.only <- peers.in.net[!peers.in.net %in% ids.in.survey]
45 |
46 | caic <- bind_rows(
47 | cai,
48 | cain.peer.summary[cain.peer.summary$id %in% peers.in.net.only,]
49 | )
50 |
51 | head(vertex_attr(g, 'name'))
52 | tmp.name <- vertex_attr(g, 'name')
53 | caic$name <- caic$id
54 | vertex_attr(g, index = as.character(caic$id)) <- as.list(caic)
55 | vertex_attr(g, 'name') <- tmp.name
56 | head(vertex_attr(g, 'id'))
57 | head(vertex_attr(g, 'name'))
58 | head(vertex_attr(g, 'village'))
59 |
60 | g1 <- induced_subgraph(g, V(g)[which(!is.na(V(g)$intensive))])
61 |
62 | # make adjacency matrix
63 | A <- as_adj(g1, sparse = TRUE, names = T)
64 | A.df <- as.data.frame(vertex_attr(g1))
65 |
66 | A.df$intensive.0 <- ifelse(is.na(A.df$intensive), 0, A.df$intensive)
67 | A.df$delay.0 <- ifelse(is.na(A.df$delay), 0, A.df$delay)
68 | A.df$intensive.peers <- as.vector(A %*% A.df$intensive.0)
69 | A.df$intensive.nond.peers <- as.vector(A %*% (A.df$intensive.0 * (1-A.df$delay.0)))
70 | A.df$default.peers <- as.vector(A %*% A.df$default)
71 | A.df$n.peers <- rowSums(A)
72 | A.df$n.elig.peers <- as.vector(A %*% !is.na(A.df$intensive))
73 |
74 | # compare my counts with the data
75 | with(A.df, table(
76 | round(network_obs * network_rate_preintensive),
77 | intensive.nond.peers,
78 | useNA = "ifany"
79 | ))
80 |
81 | with(A.df, table(
82 | network_obs,
83 | n.peers,
84 | useNA = "ifany"
85 | ))
86 |
87 | with(A.df, table(
88 | n.peers,
89 | n.elig.peers
90 | ))
91 |
92 | tmp <- merge(
93 | cain.s, A.df %>% select(id, intensive.nond.peers, network_obs, network_rate_preintensive))
94 |
95 | with(tmp, table(
96 | round(network_obs * network_rate_preintensive),
97 | n.intensive.pre,
98 | useNA = "ifany"
99 | ))
100 |
101 | ###
102 | # write simplified data
103 | A.df.to.write <- A.df %>%
104 | select(
105 | id, address, region, village, takeup_survey, age, male,
106 | delay, intensive, info_none, intensive.nondelay.peers = intensive.nond.peers, n.peers
107 | )
108 |
109 | write.table(
110 | A.df.to.write,
111 | file = "cai_data/cai.main.tsv",
112 | row.names = TRUE,
113 | sep = "\t"
114 | )
115 |
116 | save(A, file = "cai_data/cai.adjacency.RData")
117 |
--------------------------------------------------------------------------------