├── .gitignore
├── conf
│   ├── config
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   ├── core-site.xml
│   └── yarn-site.xml
├── start-hadoop.sh
├── LICENSE
├── Dockerfile
└── README.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.tar.gz
*.tgz
*.zip

--------------------------------------------------------------------------------
/conf/config:
--------------------------------------------------------------------------------
Host *
UserKnownHostsFile /dev/null
StrictHostKeyChecking no

--------------------------------------------------------------------------------
/conf/hdfs-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>

--------------------------------------------------------------------------------
/conf/mapred-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>

--------------------------------------------------------------------------------
/conf/core-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>

--------------------------------------------------------------------------------
/conf/yarn-site.xml:
--------------------------------------------------------------------------------
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>127.0.0.1:8032</value>
    </property>
</configuration>

--------------------------------------------------------------------------------
/start-hadoop.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# start ssh server
/etc/init.d/ssh start

# format namenode
$HADOOP_HOME/bin/hdfs namenode -format

# start hadoop
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver

# keep container running
tail -f /dev/null

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Carneirão

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:18.04

# set environment vars
ENV HADOOP_BASE /opt/hadoop
ENV HADOOP_HOME /opt/hadoop/current
ENV HADOOP_VERSION=2.8.5
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
ENV SPARK_BASE /opt/spark
ENV SPARK_HOME /opt/spark/current
ENV SPARK_VERSION=2.4.4

# configure the timezone up front to avoid interactive prompts from the tzdata package
ENV TZ=America/Sao_Paulo
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install packages
RUN \
    apt-get update && apt-get install -y \
    net-tools \
    sudo \
    curl \
    ssh \
    rsync \
    vim \
    openjdk-8-jdk \
    maven \
    python3-pip \
    jupyter-notebook


# download and extract hadoop, set JAVA_HOME in hadoop-env.sh, update path
RUN curl -L \
    --progress-bar 'https://www-us.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz' \
    -o "hadoop-$HADOOP_VERSION.tar.gz"

COPY hadoop-$HADOOP_VERSION.tar.gz .
RUN mkdir -p $HADOOP_BASE \
    && tar -xzvmf hadoop-$HADOOP_VERSION.tar.gz -C $HADOOP_BASE/ \
    && cd $HADOOP_BASE \
    && ln -s hadoop-$HADOOP_VERSION current \
    && cd / \
    && echo "export JAVA_HOME=$JAVA_HOME" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
    && echo "PATH=$PATH:$HADOOP_HOME/bin" >> ~/.bashrc

# create ssh keys for root (the Hadoop start scripts ssh to localhost)
RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
RUN cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
RUN chmod 0600 ~/.ssh/authorized_keys

# copy hadoop configs
COPY conf/*xml $HADOOP_HOME/etc/hadoop/

# copy ssh config
COPY conf/config /root/.ssh/config

# create hduser user
RUN useradd -m -s /bin/bash hduser \
    && groupadd hdfs \
    && usermod -aG hdfs hduser \
    && usermod -aG sudo hduser \
    && mkdir ~hduser/.ssh

# create ssh keys for hduser and authorize root's key
RUN ssh-keygen -t rsa -P '' -f ~hduser/.ssh/id_rsa \
    && cat ~/.ssh/id_rsa.pub >> ~hduser/.ssh/authorized_keys \
    && chmod 0600 ~hduser/.ssh/authorized_keys

# download and build Spark with Maven, with Hive and Hive Thrift Server support
ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m"
RUN curl -L \
    --progress-bar 'https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz' \
    -o "spark-$SPARK_VERSION.tgz"

COPY spark-$SPARK_VERSION.tgz .
ENV SPARK_PART_VERSION=2.4
ENV HADOOP_PART_VERSION=2.8

RUN mkdir -p $SPARK_BASE && tar -xzmvf spark-$SPARK_VERSION.tgz \
    && cd spark-$SPARK_VERSION \
    && ./build/mvn \
    -Pyarn -Phadoop-$HADOOP_PART_VERSION -Dhadoop.version=$HADOOP_VERSION \
    -Phive -Phive-thriftserver \
    -DskipTests clean package

# Moving the built Spark tree straight into $SPARK_HOME proved impractical,
# so repack it as a tarball and extract it under $SPARK_BASE instead
RUN cd /
RUN tar -cBpvzf spark-$SPARK_VERSION.tar.gz spark-$SPARK_VERSION
RUN tar -xzvmf spark-$SPARK_VERSION.tar.gz -C $SPARK_BASE/
RUN ln -s spark-$SPARK_VERSION $SPARK_HOME \
    && cd /

# Install pyspark
RUN pip3 install pyspark

# Configuring ~hduser/.bashrc
RUN echo "export JAVA_HOME=$JAVA_HOME" >> ~hduser/.bashrc \
    && echo "export HADOOP_HOME=$HADOOP_HOME" >> ~hduser/.bashrc \
    && echo "alias python='python3.6'" >> ~hduser/.bashrc \
    && echo "alias pip='pip3'" >> ~hduser/.bashrc \
    && echo "export PYSPARK_PYTHON='python3.6'" >> ~hduser/.bashrc \
    && echo "export SPARK_HOME=$SPARK_HOME" >> ~hduser/.bashrc \
    && echo "export SPARK_MAJOR_VERSION=2" >> ~hduser/.bashrc \
    && echo "export PATH=$PATH:$HADOOP_HOME/bin:$SPARK_HOME/bin" >> ~hduser/.bashrc

# copy script to start hadoop
COPY start-hadoop.sh /start-hadoop.sh
RUN bash start-hadoop.sh &

# Preparing HDFS for hduser
RUN $HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hduser
RUN $HADOOP_HOME/bin/hdfs dfs -chown hduser /user/hduser

# Cleanup
RUN rm -f *.tar.gz *.tgz *.sh

# expose various ports
EXPOSE 8088 8888 5000 50070 50075 50030 50060

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# bigram-hadoop

A "bootstrap" image for pySpark developers.


## Version

0.0.8


## Introduction

This repository holds a Dockerfile and the supporting files needed to build an image that provides a minimal environment for working with Hadoop and pySpark.


## Features

* Hadoop 2.8.5 (MapReduce + YARN + HDFS)
* Spark 2.4.4 (built from source with Maven, since this combination of Hadoop and Spark versions has no prebuilt package)
* Python 3.6
* jupyter-notebook


## Requirements

* Some Linux distro (untested on Windows and macOS, though it will probably work on macOS)

* docker 19.03.5

* Dockerfile

* 16GB of RAM

* An Intel Core i5 is OK, but an i7 is recommended


## Files, directories

```bash
.
├── conf
│   ├── config
│   ├── core-site.xml
│   ├── hdfs-site.xml
│   ├── mapred-site.xml
│   └── yarn-site.xml
├── Dockerfile
├── LICENSE
├── README.md
└── start-hadoop.sh

```

* conf/config: ssh configuration file

* conf/*-site.xml: basic Hadoop configuration files

* Dockerfile: the Dockerfile used to build the image/container

* start-hadoop.sh: script that starts the Hadoop environment (HDFS, YARN, job history server)


## Getting started

First of all, **install docker!**

* [How to install docker on Ubuntu/Mint](https://docs.docker.com/install/linux/docker-ce/ubuntu/)


Then, choose your "destiny"!


### Dockerhub way

Faster, not as much fun and **probably out of date**, but it works! Just run *docker pull* as below:

`docker pull carneiro/bigram-hadoop`

Dockerhub image site: [carneiro/bigram-hadoop](https://hub.docker.com/repository/docker/carneiro/bigram-hadoop)


### Dockerfile way

The Dockerfile is certainly up to date, but it is very slow to build!
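
Note: the build expects the Hadoop and Spark source tarballs to already be in the repository root, since the Dockerfile `COPY`s `hadoop-2.8.5.tar.gz` and `spark-2.4.4.tgz` into the image (and the `www-us.apache.org` mirror it curls from may no longer resolve). A minimal sketch for fetching them beforehand, assuming the usual Apache archive layout:

```bash
# download the tarballs the Dockerfile COPYs from the build context
curl -L -o hadoop-2.8.5.tar.gz \
    https://archive.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz
curl -L -o spark-2.4.4.tgz \
    https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz
```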

What will this do?

1. Start from a basic Linux image (Ubuntu 18.04)
2. Install the Hadoop 2.8.5 basic stack (HDFS, YARN, MapReduce, etc.)
3. Build Spark 2.4.4 with Maven and configure it


Everything is on [Github](https://github.com/bang/bigram-hadoop)


1. Dockerfile:

```dockerfile
FROM ubuntu:18.04

# set environment vars
ENV HADOOP_BASE /opt/hadoop
ENV HADOOP_HOME /opt/hadoop/current
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
ENV SPARK_BASE /opt/spark
ENV SPARK_HOME /opt/spark/current

# configure the timezone up front to avoid interactive prompts from the tzdata package
ENV TZ=America/Sao_Paulo
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# Install packages
RUN \
    apt-get update && apt-get install -y \
    net-tools \
    sudo \
    curl \
    ssh \
    rsync \
    vim \
    openjdk-8-jdk \
    maven \
    python3-pip \
    jupyter-notebook


# download and extract hadoop, set JAVA_HOME in hadoop-env.sh, update path
#RUN curl -L \
#    --progress-bar 'https://www-us.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz' \
#    -o "hadoop-2.8.5.tar.gz"
ENV HADOOP_VERSION=2.8.5
COPY hadoop-$HADOOP_VERSION.tar.gz .
RUN mkdir -p $HADOOP_BASE \
    && tar -xzvmf hadoop-$HADOOP_VERSION.tar.gz -C $HADOOP_BASE/ \
    && cd $HADOOP_BASE \
    && ln -s hadoop-$HADOOP_VERSION current \
    && cd / \
    && echo "export JAVA_HOME=$JAVA_HOME" >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
    && echo "PATH=$PATH:$HADOOP_HOME/bin" >> ~/.bashrc

# create ssh keys for root (the Hadoop start scripts ssh to localhost)
RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
RUN cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
RUN chmod 0600 ~/.ssh/authorized_keys

# copy hadoop configs
COPY conf/*xml $HADOOP_HOME/etc/hadoop/

# copy ssh config
COPY conf/config /root/.ssh/config

# create hduser user
RUN useradd -m -s /bin/bash hduser \
    && groupadd hdfs \
    && usermod -aG hdfs hduser \
    && usermod -aG sudo hduser \
    && mkdir ~hduser/.ssh

# create ssh keys for hduser and authorize root's key
RUN ssh-keygen -t rsa -P '' -f ~hduser/.ssh/id_rsa \
    && cat ~/.ssh/id_rsa.pub >> ~hduser/.ssh/authorized_keys \
    && chmod 0600 ~hduser/.ssh/authorized_keys

# download and build Spark with Maven, with Hive and Hive Thrift Server support
ENV MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m"
# RUN curl -L \
#    --progress-bar 'https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4.tgz' \
#    -o "spark-2.4.4.tgz"

ENV SPARK_VERSION=2.4.4

COPY spark-$SPARK_VERSION.tgz .
ENV SPARK_PART_VERSION=2.4
ENV HADOOP_PART_VERSION=2.8

RUN mkdir -p $SPARK_BASE && tar -xzmvf spark-$SPARK_VERSION.tgz \
    && cd spark-$SPARK_VERSION \
    && ./build/mvn \
    -Pyarn -Phadoop-$HADOOP_PART_VERSION -Dhadoop.version=$HADOOP_VERSION \
    -Phive -Phive-thriftserver \
    -DskipTests clean package

# Moving the built Spark tree straight into $SPARK_HOME proved impractical,
# so repack it as a tarball and extract it under $SPARK_BASE instead
# ENV SPARK_VERSION=2.4.4
# ENV SPARK_BASE=/opt/spark
# ENV SPARK_HOME=$SPARK_BASE/current
RUN cd /
RUN tar -cBpvzf spark-$SPARK_VERSION.tar.gz spark-$SPARK_VERSION
#RUN rm -f spark-$SPARK_BASE/$SPARK_VERSION
RUN tar -xzvmf spark-$SPARK_VERSION.tar.gz -C $SPARK_BASE/
RUN ln -s spark-$SPARK_VERSION $SPARK_HOME \
    && cd /

# Install pyspark
RUN pip3 install pyspark

# Configuring ~hduser/.bashrc
RUN echo "export JAVA_HOME=$JAVA_HOME" >> ~hduser/.bashrc \
    && echo "export HADOOP_HOME=$HADOOP_HOME" >> ~hduser/.bashrc \
    && echo "alias python='python3.6'" >> ~hduser/.bashrc \
    && echo "alias pip='pip3'" >> ~hduser/.bashrc \
    && echo "export PYSPARK_PYTHON='python3.6'" >> ~hduser/.bashrc \
    && echo "export SPARK_HOME=$SPARK_HOME" >> ~hduser/.bashrc \
    && echo "export SPARK_MAJOR_VERSION=2" >> ~hduser/.bashrc \
    && echo "export PATH=$PATH:$HADOOP_HOME/bin:$SPARK_HOME/bin" >> ~hduser/.bashrc

# copy script to start hadoop
COPY start-hadoop.sh /start-hadoop.sh
RUN bash start-hadoop.sh &

# Preparing HDFS for hduser
RUN $HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hduser
RUN $HADOOP_HOME/bin/hdfs dfs -chown hduser /user/hduser

# Cleanup
RUN rm -f *.tar.gz *.tgz *.sh

# TODO: run jupyter-notebook as hduser


# expose various ports
EXPOSE 8088 8888 5000 50070 50075 50030 50060

```


2. Building the image

`docker build -t bigram-hadoop .`


3. Creating the container

```bash
docker run \
    --network host \
    --cpus=".5" \
    --memory="8g" \
    --name bigram-hadoop-container \
    -d bigram-hadoop
```

* **--network host**: the container shares the host's network stack, so it uses the host's IP and its ports are reachable directly. **Never use this in production! It is only for development on YOUR machine!**

* **--cpus=".5"**: limit the container to half of one CPU

* **--memory="8g"**: cap the container at 8GB of RAM

* **--name**: gives the container a name

* **-d**: run the container in the background (detached)

* **bigram-hadoop**: name of the image


4. Running jupyter-notebook

`docker run --network host -p 8888 -p 8088 --user hduser -it bigram-hadoop jupyter-notebook`


You'll see something like this:

```bash
WARNING: Published ports are discarded when using host network mode
[I 15:36:47.100 NotebookApp] Writing notebook server cookie secret to /home/hduser/.local/share/jupyter/runtime/notebook_cookie_secret
[I 15:36:47.277 NotebookApp] Serving notebooks from local directory: /
[I 15:36:47.277 NotebookApp] 0 active kernels
[I 15:36:47.277 NotebookApp] The Jupyter Notebook is running at:
[I 15:36:47.277 NotebookApp] http://localhost:8888/?token=d940ac2eff1330843681bb360ffec84f604a3c43643723a1
[I 15:36:47.277 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
[W 15:36:47.277 NotebookApp] No web browser found: could not locate runnable browser.
[C 15:36:47.277 NotebookApp]

    Copy/paste this URL into your browser when you connect for the first time,
    to login with a token:
        http://localhost:8888/?token=d940ac2eff1330843681bb360ffec84f604a3c43643723a1
```


Now open the 'http' address in your browser, create a new Python 3 notebook, paste in the code below and run it!

```python
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# start session
spark = SparkSession.builder.appName('test').enableHiveSupport().getOrCreate()

# Setting some data
data = [["Spark", "is", "awesome!"]]

# Declaring schema
schema = StructType(fields=[
    StructField("col1", StringType(), True),
    StructField("col2", StringType(), True),
    StructField("col3", StringType(), True)
])

# Getting a dataframe from all of this
df = spark.createDataFrame(data, schema)
```

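If the session comes up, you can inspect the DataFrame and, since `fs.defaultFS` points at `hdfs://localhost:9000` and the image already created `/user/hduser` on HDFS, write it out and read it back. A minimal follow-up sketch (the `test_df` path is just an example name):

```python
# display the single-row DataFrame built above
df.show()

# write it to HDFS under hduser's home directory (the path name is arbitrary)
df.write.mode("overwrite").parquet("hdfs://localhost:9000/user/hduser/test_df")

# read it back to confirm the round trip through HDFS
spark.read.parquet("hdfs://localhost:9000/user/hduser/test_df").show()
```

--------------------------------------------------------------------------------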