├── .gitignore
├── Databricks Certified Associate
│   ├── .directory
│   ├── README.md
│   ├── Spark dd8609c5ca3041bb89b503bc314efe53
│   │   ├── Databricks 16b513dab9164637bb0c8f7f9f7d0013.md
│   │   ├── Databricks 16b513dab9164637bb0c8f7f9f7d0013
│   │   │   ├── Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f.md
│   │   │   ├── Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f
│   │   │   │   ├── Untitled 1.png
│   │   │   │   ├── Untitled 2.png
│   │   │   │   └── Untitled.png
│   │   │   ├── Untitled 1.png
│   │   │   └── Untitled.png
│   │   ├── Untitled 1.png
│   │   └── Untitled.png
│   └── notebooks
│       ├── ETL-Part-1-1.3.1-SPNC.dbc
│       ├── LearningSparkv2.dbc
│       ├── Spark-Programming-1.0.dbc
│       └── Spark-Programming-1.5.0-IL.dbc
├── README.md
└── Spark - The Definite Guide
    ├── README.md
    ├── assets
    │   ├── banner_main.png
    │   ├── banner_session_10.png
    │   ├── banner_session_11.png
    │   ├── banner_session_12.png
    │   ├── banner_session_13.png
    │   ├── banner_session_14.png
    │   ├── banner_session_15.png
    │   ├── banner_session_16.png
    │   ├── banner_session_17.png
    │   ├── banner_session_18.png
    │   ├── banner_session_19.png
    │   ├── banner_session_20.png
    │   ├── banner_session_21.png
    │   ├── banner_session_22.png
    │   ├── banner_session_23.png
    │   ├── banner_session_24.png
    │   ├── banner_session_5.png
    │   ├── banner_session_6.png
    │   ├── banner_session_7.png
    │   ├── banner_session_8.png
    │   ├── banner_session_9.png
    │   └── youtube.png
    ├── data
    │   ├── 2015-summary.json
    │   ├── customers.csv
    │   ├── customers.sql
    │   ├── secret.txt
    │   ├── simple-ml
    │   │   ├── DatosNuevos.json
    │   │   └── _SUCCESS
    │   ├── test
    │   │   └── part-r-00000-f5c243b9-a015-4a3b-a4a8-eca00f80f04c.json
    │   └── transactions.csv
    └── sessions
        ├── session_10
        │   └── README.md
        ├── session_11
        │   └── README.md
        ├── session_12
        │   └── README.md
        ├── session_13
        │   └── README.md
        ├── session_14
        │   └── README.md
        ├── session_15
        │   ├── Advanced Analytics - Demo Lab.ipynb
        │   └── README.md
        ├── session_16
        │   ├── README.md
        │   └── Text Preprocessing and Feature Extraction.ipynb
        ├── session_17
        │   ├── Feature Transformation.ipynb
        │   └── README.md
        ├── session_18
        │   ├── Feature Selection.ipynb
        │   ├── README.md
        │   └── assets
        │       ├── 1.png
        │       ├── 2.png
        │       ├── 3.png
        │       ├── 4.png
        │       ├── 5.png
        │       └── 6.png
        ├── session_19
        │   └── README.md
        ├── session_20
        │   ├── Classification.ipynb
        │   ├── README.md
        │   └── assets
        │       ├── 1.png
        │       ├── 2.png
        │       └── 3.jpg
        ├── session_21
        │   ├── README.md
        │   ├── Recomendation System.scala
        │   └── sample_movielens_ratings.txt
        ├── session_22
        │   └── README.md
        ├── session_23
        │   ├── README.md
        │   ├── Spark-Chapter_30.scala
        │   └── Spark-Chapter_30_notebook.scala
        ├── session_24
        │   ├── README.md
        │   └── SparkDL.ipynb
        ├── session_5
        │   ├── README.md
        │   ├── basic_structured_operation.scala
        │   └── index.html
        ├── session_6
        │   ├── README.md
        │   └── chapter6.scala
        ├── session_7
        │   ├── Aggregations.ipynb
        │   └── README.md
        ├── session_8
        │   ├── README.md
        │   ├── Spark Joins.ipynb
        │   └── assets
        │       ├── anti.png
        │       ├── inner.png
        │       ├── outer.png
        │       └── semi.png
        └── session_9
            ├── Data Sources and Spark SQL.ipynb
            └── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/Databricks Certified Associate/.directory:
--------------------------------------------------------------------------------
1 | [Desktop Entry]
2 | Icon=certificate-server
3 |
--------------------------------------------------------------------------------
/Databricks Certified Associate/README.md:
--------------------------------------------------------------------------------
1 | # Spark
2 |
3 | - Architecture
4 |
5 | [Cluster Mode Overview](https://spark.apache.org/docs/latest/cluster-overview.html)
6 |
7 | - Application
8 | - User program built on Spark. *Consists of a driver program and executors on the cluster*.
9 | - Cluster
10 | - The system currently supports several cluster managers:
11 | - Standalone – a simple cluster manager included with Spark that makes it easy to set up a cluster.
12 | - Apache Mesos – a general cluster manager that can also run Hadoop MapReduce and service applications.
13 | - Hadoop YARN – the resource manager in Hadoop 2.
14 | - Kubernetes – an open-source system for automating deployment, scaling, and management of containerized applications.
15 | - Groups of nodes
16 | - Nodes are the individual machines within a cluster (generally a VM)
17 | - In Databricks, the driver (a JVM) and each executor (also a JVM) run on their own node
18 |
19 | Spark Cluster → Driver + Executors
20 |
21 | - Cluster manager
22 | - An external service for acquiring resources on the cluster (e.g. standalone manager, Mesos, YARN)
23 | - Driver
24 | - Runs the Spark application
25 | - Assigns tasks to slots in an executor
26 | - Coordinates the work between tasks
27 | - Receives the results, if any
28 | - Executor
29 | - A JVM running on a node
30 | - Provides an environment in which tasks can be run
31 | - Leverages the JVM to execute many threads (4 cores = 4 slots = 4 threads)
32 | - Jobs
33 | - A parallel computation consisting of multiple tasks that gets spawned in response to a Spark action (e.g. save, collect);
34 | - Stage
35 | - Each job gets divided into smaller sets of tasks called stages that depend on each other (similar to the map and reduce stages in MapReduce)
36 | - A stage cannot be completed until all tasks are completed
37 | - One long-running task can delay an entire stage from completing
38 | - Task/Cores/Threads
39 | - The lowest unit of parallelization
40 | - Executes a set of transformations against a partition, as directed by the driver
41 | - A **Task** is a single operation (`.map` or `.filter`) applied to a single **Partition**.
42 | - Each **Task** is executed as a single thread in an **Executor**!
43 | - If your dataset has 2 **Partitions**, an operation such as a `filter()` will trigger 2 **Tasks**, one for each **Partition** (see the sketch after this list).
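
A minimal sketch of the execution hierarchy (assuming a DataFrame `df` with a `device` column; the names are illustrative):

```scala
// Transformations only build up the plan; no job runs yet.
val grouped = df.groupBy("device").count()

// The action spawns a job. The groupBy forces a shuffle, so the job is
// split into two stages, each running one task per partition.
grouped.collect()
```
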
44 | - Slots
45 |
46 | Cores (or slots) are the number of available threads for each executor
47 |
48 |
52 |
53 | - Partition
54 | - A ~128MB chunk of the large dataset
55 | - Each task processes one and only one partition
56 | - The size and record splits are decided by the Driver
57 | - The initial size is partially adjustable with various configuration options
58 |
59 | 
60 |
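A hedged sketch of how these knobs map to configuration when building a session (the values are illustrative only, not recommendations):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("cluster-sizing-example")
  .config("spark.executor.instances", "4")      // executors: one JVM each
  .config("spark.executor.cores", "4")          // 4 cores = 4 slots = 4 task threads per executor
  .config("spark.sql.shuffle.partitions", "16") // partitions (and tasks) produced by shuffles
  .getOrCreate()
```
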
61 | - DataFrames
62 | - Reader & Writer
63 |
64 | format("parquet") is by default in Spark, format("delta") is by default databricks
65 |
66 | ```scala
67 | val parqDF = spark.read.parquet("/../output/people.parquet")
68 | val parqDF = spark.read.load("/../output/people.parquet")
69 | val parqDF = spark.read.format("parquet").load("/../people.parquet")
70 |
71 | // Read a specific Parquet partition
72 | val parqDF = spark.read.parquet("/../people2.parquet/gender=M")
73 |
74 | //WRITER
75 | df.write.parquet("/tmp/output/people.parquet")
76 | df.write.mode("append").option("compression", "snappy").parquet("/../people.parquet")
77 | df.write.partitionBy("gender","salary").parquet("/../../people2.parquet")
78 |
79 | df.write.format("parquet").save("/tmp/output/people.parquet")
80 | df.write.save("/tmp/output/people.parquet")
81 | df.write.format("parquet").mode("append").option("compression", "snappy").save("/../people.parquet")
82 | df.write.format("parquet").partitionBy("gender","salary").save("/../../people2.parquet")
83 | ```
84 |
85 | ```scala
86 | // Read one file or directory
87 | val df = spark.read.csv("./zipcodes.csv")
88 | val df = spark.read.format("csv").load("./zipcodes.csv")
89 |
90 | // Read multiple files
91 | val df = spark.read.csv("path1,path2,path3")
92 |
93 | //Read multi options
94 | val df = spark.read.option("sep", "\t")
95 | .option("header", true)
96 | .option("inferSchema", true) // .schema(userDefinedSchema)
97 | .csv(usersCsvPath)
98 |
99 | ```
100 |
101 | ```scala
102 | //read json file into dataframe
103 | val df = spark.read.json("./zipcodes.json")
104 | val df = spark.read.format("json").load("./zipcodes.json")
105 |
106 | //read multiline json file
107 | val multiline_df = spark.read.option("multiline","true").json("./multiline.json")
108 |
109 | // Read multiple files
110 | val df = spark.read.json("path1,path2,path3")
111 |
112 | //WRITING
113 | df.write.format("csv").mode("overwrite").option("header","true").save("/tmp/output/df_csv")
114 | ```
115 |
116 | ```scala
117 | df.write.mode("overwrite").saveAsTable("TableName")
118 | ```
119 |
120 | ```sql
121 | CREATE OR REPLACE TEMPORARY VIEW my_view_name
122 | USING parquet
123 | OPTIONS (path "/../../../file.parquet")
124 | ```
125 |
126 | Best Practice: Write Results to a Delta Table
127 |
128 | ```scala
129 | eventsDF.write.format("delta").mode("overwrite").save(eventsOutputPath)
130 | data.write.format("delta").mode("overwrite").save("/tmp/delta-table")
131 |
132 | //Read
133 | val df = spark.read.format("delta").load("/tmp/delta-table")
134 |
135 | // Streaming read (the "rate" source generates rows for testing)
136 | val streamingDf = spark.readStream.format("rate").load()
137 | ```
138 |
139 | - Schema
140 |
141 | You can use the **StructType** Scala method **toDDL** to have a DDL-formatted string created for you.
142 |
143 | ```scala
144 | usersDF.printSchema() // print schema
145 | usersDF.schema // returns the schema as a StructType
146 |
147 | // Read schema to DDL
148 | val DDLSchema = spark.read.parquet("/../events/events.parquet").schema.toDDL
149 |
150 | val eventsDF = spark.read.schema(DDLSchema).json(eventsJsonPath)
151 | ```
152 |
153 | ```scala
154 | import org.apache.spark.sql.types.{ArrayType, DoubleType, IntegerType, LongType, StringType, StructType, StructField}
155 |
156 | val userDefinedSchema = StructType( Seq(
157 | StructField("user_id", StringType, true),
158 | StructField("email", StringType, true)
159 | ) )
160 | ```
161 |
162 | - DataFrame & Column
163 |
164 | A column is a logical construction that will be computed based on the data in a DataFrame using an expression
165 |
166 | ```scala
167 | col("device")
168 | $"device"
169 | eventsDF("device")
170 | ```
171 |
172 | `select()`
173 |
174 | ```scala
175 | import org.apache.spark.sql.functions.{col,lit}
176 | val df = eventsDF.select("user_id", "device")
177 | val df2 = eventsDF.select(col("user_id"),col("geo.city").alias("city"),lit("a lit"))
178 | ```
179 |
180 | New Column `selectExpr()`
181 |
182 | ```scala
183 | val appleDF = eventsDF.selectExpr("user_id", "device in ('iOS') as apple_user")
184 | val df = eventsDF.selectExpr("user_id", "3+1 as newColumn")
185 | ```
186 |
187 | Drop a column or columns `drop()`
188 |
189 | ```scala
190 | val anonymousDF = eventsDF.drop("user_id", "geo", "device")
191 | val noSalesDF = eventsDF.drop(col("ecommerce"))
192 | ```
193 |
194 | Add or replace columns `withColumn()` `withColumnRenamed()`
195 |
196 | ```scala
197 | val df1 = eventsDF.withColumn("newColumnName", col("device").isin("iOS", "Android"))
198 | val df2 = eventsDF.withColumn("replaceColumnName", col("ecommerce.quantity").cast("int"))
199 |
200 | // Rename a column
201 | val df3 = eventsDF.withColumnRenamed("oldName", "newName")
202 | ```
203 |
204 | `dropDuplicates()`
205 |
206 | ```scala
207 | val df = eventsDF.dropDuplicates(Seq("device", "traffic_source"))
208 | val df2 = df.dropDuplicates()
209 | val df3 = df.distinct()
210 | ```
211 |
212 | `limit()` returns a new DataFrame by taking the **first n rows**, similar to TOP(10) in SQL
213 |
214 | ```scala
215 | val limitDF = eventsDF.limit(100)
216 | ```
217 |
218 | `sort()` returns a new DataFrame sorted by the given columns; `orderBy()` is an alias
219 |
220 | ```scala
221 | val df1 = eventsDF.orderBy("touch_timestamp", "event_timestamp")
222 | val df2 = eventsDF.sort(col("touch_timestamp").desc, col("event_timestamp"))
223 | ```
224 |
225 | - Aggregation
226 |
227 | `groupBy()` creates a grouped data object, called `RelationalGroupedDataset` in **Scala** and `GroupedData` in **Python**
228 |
229 | ```scala
230 | df.groupBy("event_name") // RelationalGroupedDataset
231 | val eventCountsDF = df.groupBy("event_name").count()
232 | val quantitiesDF = df.groupBy("geo.state", "geo.city").sum("ecommerce.quantity")
233 | ```
234 |
235 | `agg()` applies built-in aggregate functions to the grouped data
236 |
237 | ```scala
238 | import org.apache.spark.sql.functions.{avg, approx_count_distinct, sum}
239 | val purchasesDF = df.groupBy("geo.state").agg(sum("ecommerce.quantity").alias("purchases"))
240 |
241 | val stateAggregatesDF = df.groupBy("geo.state").agg(
242 | avg("ecommerce.quantity").alias("avg_quantity"),
243 | approx_count_distinct("user_id").alias("distinct_users"))
244 | ```
245 |
246 | - Datetime Functions
247 |
248 | `cast()` casts a column to a different data type, specified using a string representation or a DataType
249 |
250 | ```scala
251 | val timestampDF = df.withColumn("timestamp", (col("timestamp") / 1e6).cast("timestamp"))
252 | ```
253 |
254 | `date_format()` converts a *date/timestamp/string* column to a **string** in the given format
255 |
256 | ```scala
257 | import org.apache.spark.sql.functions.date_format
258 |
259 | val df = timestampDF
260 | .withColumn("date string", date_format(col("timestamp"), "MMMM dd, yyyy"))
261 | .withColumn("time string", date_format(col("timestamp"), "HH:mm:ss.SSSSSS"))
262 | ```
263 |
264 | `year()`, `month()`, `dayofweek()`, `minute()`, `second()` extract the corresponding field as an **integer** from a given *date/timestamp/string*.
265 |
266 | ```scala
267 | import org.apache.spark.sql.functions.{year, month, dayofweek, minute, second}
268 |
269 | val datetimeDF = timestampDF
270 | .withColumn("year", year(col("timestamp")))
271 | .withColumn("month", month(col("timestamp")))
272 | .withColumn("dayofweek", dayofweek(col("timestamp")))
273 | .withColumn("minute", minute(col("timestamp")))
274 | .withColumn("second", second(col("timestamp")))
275 | ```
276 |
277 | `to_date()` converts the column into `DateType` by applying the casting rules to `DateType`.
278 |
279 | ```scala
280 | import org.apache.spark.sql.functions.to_date
281 | val dateDF = timestampDF.withColumn("date", to_date(col("timestamp")))
282 | ```
283 |
284 | `date_add(col("columnName"), days)` returns the date that is the given number of days after the start date
285 |
286 | ```scala
287 | import org.apache.spark.sql.functions.date_add
288 | val df = timestampDF.withColumn("plus_twoDays", date_add(col("timestamp"), 2))
289 | ```
290 |
291 | - Complex Types
292 |
293 | Extract fields from complex (struct/array) columns
294 |
295 | ```scala
296 | import org.apache.spark.sql.functions._
297 |
298 | val detailsDF = df.withColumn("items", explode(col("items")))
299 | .select("email", "items.item_name") // one field
300 | .withColumn("details", split(col("item_name"), " ")) // array type
301 |
302 | ```
303 |
304 | `array_contains(col("column"), "word")` searches for a value in an array column
305 |
306 | `element_at(col("arrayColumn"), index)` returns the element of the array at the given (1-based) index
307 |
308 | `filter()` returns only the rows for which the condition is true
309 |
310 | ```scala
311 | val mattressDF = detailsDF.filter(array_contains(col("details"), "Mattress"))
312 | .withColumn("size", element_at(col("details"), 2))
313 | ```
314 |
315 | `df_a.unionByName(df_b)` unions two DataFrames by matching column names
316 |
317 | ```scala
318 | val unionDF = mattressDF.unionByName(pillowDF).drop("details")
319 | ```
320 |
321 | `agg( collect_set("col") )` returns an array of the distinct values
322 |
323 | ```scala
324 | val optionsDF = unionDF.groupBy("email")
325 |   .agg(collect_set("size").alias("size options"))
326 | ```
328 |
329 | - Laziness
330 |
331 | For large datasets, even a basic transformation can take millions of operations to execute. All you need to do is tell Spark which transformations you want to perform on the dataset, and Spark will maintain the series of transformations. When you ask Spark for the results, it will find the best path, perform the required transformations, and give you the result.
332 |
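A minimal sketch of lazy evaluation, assuming a `people` DataFrame with `name` and `age` columns:

```scala
import org.apache.spark.sql.functions.col

// Transformations are lazy: nothing is computed here.
val adults = people.filter(col("age") >= 18).select("name")

// Only an action triggers Spark to pick an execution plan and run it.
val total = adults.count()
```
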
333 | 
334 |
335 | - [AQE]
336 |
337 | [How to Speed up SQL Queries with Adaptive Query Execution](https://databricks.com/blog/2020/05/29/adaptive-query-execution-speeding-up-spark-sql-at-runtime.html)
338 |
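A short sketch of enabling AQE (these configuration keys exist in Spark 3.x):

```scala
// Re-optimize query plans at runtime using shuffle statistics.
spark.conf.set("spark.sql.adaptive.enabled", "true")

// e.g. automatically coalesce small post-shuffle partitions.
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
```
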
339 | - Partitioning
340 |
341 | A partition in Spark is an atomic chunk of data (a logical division of the data) stored on a node in the cluster. Partitions are the basic units of parallelism in Apache Spark.
342 |
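A small sketch of inspecting and changing the partitioning of a hypothetical DataFrame `df`:

```scala
df.rdd.getNumPartitions          // how many partitions the data currently has

val wider = df.repartition(8)    // wide: full shuffle into 8 partitions
val narrower = wider.coalesce(2) // narrow: merges partitions without a full shuffle
```
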
343 | - Shuffling
344 |
345 | A shuffle occurs when data is rearranged between **partitions**. This is required when a transformation requires information from other partitions, such as summing all the values in a column. Spark will gather the required data from each partition and combine it into a new partition, likely on a different executor.
346 |
347 | During a shuffle, data is written to disk and transferred across the network, halting Spark's ability to do processing in-memory and causing a performance bottleneck. Consequently, we want to reduce the number of shuffles being done or the amount of data being shuffled.
348 |
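A minimal sketch of a shuffle-triggering operation, assuming a DataFrame `df` with `state` and `quantity` columns:

```scala
// Rows with the same state must be gathered into the same partition
// before they can be summed, so this groupBy triggers a shuffle.
val totals = df.groupBy("state").sum("quantity")

// The number of partitions produced by the shuffle is configurable.
spark.conf.set("spark.sql.shuffle.partitions", "64")
```
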
349 | 
350 |
351 | - Parallelism
352 | - One way to increase the parallelism of Spark processing is to increase the number of executors on the cluster
353 | - Wide and Narrow Transformation
354 |
355 | A transformation is a process that turns your RDD data from one form into another in Spark. When you apply a transformation to an RDD, you get a new RDD with the transformed data (RDDs in Spark are immutable).
356 |
357 | - Narrow Transformations
358 |
359 | These transformations convert each input partition to only one output partition: each partition of the parent RDD is used by at most one partition of the child RDD (equivalently, each child partition depends on a single parent partition).
360 |
361 | - This kind of transformation is generally fast.
362 | - Does not require any data shuffling or data movement over the cluster network.
363 | - `map()` and `filter()` belong to these transformations.
364 |
365 | 
366 |
367 | - Wide Transformations
368 |
369 | These transformations have input partitions contributing to many output partitions: each partition of the parent RDD may be used by multiple partitions of the child RDD (each child partition can depend on multiple parent partitions).
370 |
371 | - Require shuffling data across different nodes when creating the new partitions (see the sketch after this list).
372 | - Functions such as `groupByKey()`, `aggregateByKey()`, `aggregate()`, `join()`, `repartition()` are some examples of wide transformations.
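
A minimal sketch contrasting the two kinds, assuming an `RDD[Int]` named `nums`:

```scala
// Narrow: each output partition depends on exactly one input partition.
val doubled = nums.map(_ * 2)
val evens = doubled.filter(_ % 2 == 0)

// Wide: grouping by key forces a shuffle across partitions.
val counts = evens.map(n => (n % 10, 1)).reduceByKey(_ + _)
```
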
373 | - Broadcast variable
374 | - Broadcast join
375 |
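A hedged sketch of both broadcast ideas (`smallDF` and `largeDF` are hypothetical DataFrames sharing a `device` column):

```scala
import org.apache.spark.sql.functions.broadcast

// Broadcast variable: ship a read-only value to every executor once,
// instead of with every task; read it through .value.
val vendors = spark.sparkContext.broadcast(Map("iOS" -> "Apple", "Android" -> "Google"))

// Broadcast join: send the small DataFrame to every executor so the
// large DataFrame never has to shuffle.
val joined = largeDF.join(broadcast(smallDF), Seq("device"))
```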
376 |
377 |
378 | [Databricks](Spark%20dd8609c5ca3041bb89b503bc314efe53/Databricks%2016b513dab9164637bb0c8f7f9f7d0013.md)
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013.md:
--------------------------------------------------------------------------------
1 | # Databricks
2 |
3 | - Databricks SQL
4 |
5 | Databricks SQL is a Databricks environment designed with SQL analysts in mind. You can use the built-in SQL query editor to write highly performant queries directly against your organization's data lake, so you can be sure that you are always working with the most complete and current information available.
6 |
7 | - You can also connect to your preferred business intelligence (BI) tools, and use Databricks SQL to power your queries for fast performance.
8 | - You can track KPIs with automatic alerts or dashboard refreshes on the latest data
9 |
10 | 
11 |
12 | 
13 |
14 | In order to start writing queries, you will need a Databricks administrator to set up two things:
15 |
16 |
17 | [Databricks Certified Associate Developer for Apache Spark Certification](Databricks%2016b513dab9164637bb0c8f7f9f7d0013/Databricks%20Certified%20Associate%20Developer%20for%20Apach%20bae0f257766344c282eecb4ede50892f.md)
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f.md:
--------------------------------------------------------------------------------
1 | # Databricks Certified Associate Developer for Apache Spark Certification
2 |
3 | [Databricks Certified Associate Developer for Apache Spark 3.0](https://academy.databricks.com/exam/databricks-certified-associate-developer)
4 |
5 | - **Topics**
6 | - **Basics of the Apache Spark Architecture**: The architecture of Apache Spark follows from its nature as a cluster-computing framework; it describes how data is partitioned, processed, etc.
7 | - **Basics of the Apache Spark DataFrame API**: The Spark DataFrame is the fundamental user-facing data structure of Apache Spark. Its API is used to manipulate data using common data manipulation terminology.
8 |
9 | 
10 |
11 | - Architecture
12 | - Cluster architecture: nodes, drivers, workers, executors, slots, etc.
13 | - Spark execution hierarchy: applications, jobs, stages, tasks, etc.
14 | - Shuffling
15 | - Partitioning
16 | - Lazy evaluation
17 | - Transformations vs. actions
18 | - Narrow vs. wide transformations
19 | - Architecture Application
20 | - Execution deployment modes
21 | - Stability
22 | - Garbage collection
23 | - Out-of-memory errors
24 | - Storage levels
25 | - Repartitioning
26 | - Coalescing
27 | - Broadcasting
28 | - DataFrames
29 | - DataFrame API
30 | - Subsetting DataFrames (select, filter, etc.)
31 | - Column manipulation (casting, creating columns, manipulating existing columns, complex column types)
32 | - String manipulation (Splitting strings, regular expressions)
33 | - Performance-based operations (repartitioning, shuffle partitions, caching)
34 | - Combining DataFrames (joins, broadcasting, unions, etc.)
35 | - Reading/writing DataFrames (schemas, overwriting)
36 | - Working with dates (extraction, formatting, etc.)
37 | - Aggregations
38 | - Miscellaneous (sorting, missing values, typed UDFs, value extraction, sampling)
39 |
40 | - **Question overview**
41 |
42 | There are 60 total questions on this exam. All of the questions are multiple-choice questions with five options - one correct answer and four distractors.
43 |
44 | 
45 |
46 | - **Preparation**
47 | - In addition, Sections I, II, and IV of *[Spark: The Definitive Guide](https://www.oreilly.com/library/view/spark-the-definitive/9781491912201/)* and Chapters 1-7 of *[Learning Spark](https://www.oreilly.com/library/view/learning-spark-2nd/9781492050032/)* should also be helpful in preparation.
48 | - Before taking the exam, it is recommended that you complete the practice exam for your language of choice: *[Python](https://files.training.databricks.com/assessments/practice-exams/PracticeExam-DCADAS3-Python.pdf)* or *[Scala](https://files.training.databricks.com/assessments/practice-exams/PracticeExam-DCADAS3-Scala.pdf)*.
49 | - To prepare for this exam, we recommend the instructor-led courses Apache Spark Programming with Databricks or DB 105 - Apache Spark Programming.
50 | - **Resources**
51 | - Videos
52 | - Udemy
53 |
54 |
55 | [Databricks Fundamentals & Apache Spark Core](https://www.udemy.com/course/databricks-fundamentals-apache-spark-core/)
56 |
57 | [Apache Spark 3 - Databricks Certified Associate Developer](https://www.udemy.com/course/apache-spark-3-databricks-certified-associate-developer/)
58 |
59 | [Databricks Certified Developer for Spark 3.0 Practice Exams](https://www.udemy.com/course/databricks-certified-developer-for-apache-spark-30-practice-exams/)
60 |
61 |
62 | [https://www.youtube.com/watch?v=_C8kWso4ne4](https://www.youtube.com/watch?v=_C8kWso4ne4)
63 |
64 | [https://www.youtube.com/watch?v=CF5Ewk0GxiQ](https://www.youtube.com/watch?v=CF5Ewk0GxiQ)
65 |
66 | [https://www.youtube.com/watch?v=qEKfyoOUKb8&t=649s](https://www.youtube.com/watch?v=qEKfyoOUKb8&t=649s)
67 |
68 | [https://www.youtube.com/watch?v=daXEp4HmS-E](https://www.youtube.com/watch?v=daXEp4HmS-E)
69 |
70 | [https://www.youtube.com/watch?v=ywPuZ_WrHT0](https://www.youtube.com/watch?v=ywPuZ_WrHT0)
71 |
72 | - Drive
73 |
74 | [https://drive.google.com/drive/folders/1wVEY00BseyNZ6GSZ45KDJnauPK_uMYU3?usp=sharing](https://drive.google.com/drive/folders/1wVEY00BseyNZ6GSZ45KDJnauPK_uMYU3?usp=sharing)
75 |
76 | - Medium
77 |
78 | [Databricks Certified Associate Developer for Apache Spark - tips to get prepared for the exam](https://medium.com/data-arena/databricks-certified-associate-developer-for-apache-spark-tips-to-get-prepared-for-the-exam-cf947795065b)
79 |
80 | [10 Exercises To Practice Before Your Databricks Apache Spark 3.0 Developer Exam](https://towardsdatascience.com/10-mcqs-to-practice-before-your-databricks-apache-spark-3-0-developer-exam-bd886060b9ab)
81 |
82 | [Preparing and Testing your knowledge of Databricks Apache Spark 3.0 certification (Scala & Python)](https://medium.com/@mertozer94/preparing-and-testing-your-knowledge-of-databricks-apache-spark-3-0-certification-scala-python-e3605284b555)
83 |
84 | [Study guide for clearing "Databricks Certified Associate Developer for Apache Spark 3.0"](https://shrutibhawsar94.medium.com/study-guide-for-clearing-databricks-certified-associate-developer-for-apache-spark-3-0-69377dba0107)
85 |
86 | [Crack Databricks Certified Associate Developer for Apache Spark 3.0-](https://medium.com/@sriramn84_34423/crack-databricks-certified-associate-developer-for-apache-spark-3-0-cf4cb89df61d)
87 |
88 | [Databricks Certified Associate Developer for Apache Spark 3.0: How to?](https://medium.com/codex/databricks-certified-associate-developer-for-apache-spark-3-0-how-to-ba95707eca79)
89 |
90 | [Beginner's Guide to Crack Databricks Certified Associate Developer for Apache Spark 3.0-](https://medium.com/@blackhat1729/beginners-guide-to-crack-databricks-certified-associate-developer-for-apache-spark-3-0-7c1aad2a578b)
91 |
92 | - Microsoft
93 |
94 | [Data engineering with Azure Databricks - Learn](https://docs.microsoft.com/en-us/learn/paths/data-engineer-azure-databricks/)
95 |
96 | - Github
97 |
98 | [GitHub - DanielEdu/Databricks-academy](https://github.com/DanielEdu/Databricks-academy)
99 |
100 | [GitHub - databricks/LearningSparkV2: This is the github repo for Learning Spark: Lightning-Fast Data Analytics [2nd Edition]](https://github.com/databricks/LearningSparkV2)
101 |
102 | - Miro
103 |
104 | [https://miro.com/app/board/o9J_l4ECPow=/](https://miro.com/app/board/o9J_l4ECPow=/)
105 |
106 |
107 | 
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f/Untitled 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f/Untitled 1.png
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f/Untitled 2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f/Untitled 2.png
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f/Untitled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Databricks Certified Associate Developer for Apach bae0f257766344c282eecb4ede50892f/Untitled.png
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Untitled 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Untitled 1.png
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Untitled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Databricks 16b513dab9164637bb0c8f7f9f7d0013/Untitled.png
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Untitled 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Untitled 1.png
--------------------------------------------------------------------------------
/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Untitled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Databricks Certified Associate/Spark dd8609c5ca3041bb89b503bc314efe53/Untitled.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark Study Club
2 |
3 | Welcome to the repository of the Data Engineering Latam Spark study group.
4 |
5 | More information here: [Telegram](https://t.me/dataEngineeringLatam_Spark 'Telegram')
6 |
7 | ## About the group
8 |
9 | The Data Engineering Latam **Spark Study Club** runs study sessions on a variety of topics so that its members can become familiar with [Apache Spark](https://spark.apache.org), the unified, multi-language, distributed analytics engine that is part of *The Apache Software Foundation* and is used to run large-scale data engineering and data science workloads.
10 |
11 | ## About the repository
12 |
13 | This repository holds the materials used in the Apache Spark study sessions run by Data Engineering Latam.
14 |
15 | * [Databricks Certified Associate](https://github.com/DataEngineering-LATAM/Spark-StudyClub/tree/main/Databricks%20Certified%20Associate)
16 | * [Spark - The Definite Guide](https://github.com/DataEngineering-LATAM/Spark-StudyClub/tree/main/Spark%20-%20The%20Definite%20Guide)
17 |
18 | Instructions for using the repository can be found inside the respective folders at the repository `root`.
19 |
20 | ---
21 |
22 | ## About the Data Engineering Latam community
23 |
24 | Data Engineering Latam is the largest data community in Latin America, whose mission is to promote the region's talent through talks, workshops, study groups, collaborative help, and the creation of relevant content.
25 |
26 |
27 |
28 |
29 |
30 | ## Follow us on our official social networks
31 |
32 | Each and every one of our initiatives and contents is maintained without third-party support. If you want to see us grow, you can help us with your reactions, comments, and shares of our content on social media 🥹
33 |
34 | - [YouTube](https://youtube.com/c/dataengineeringlatam?sub_confirmation=1)
35 | - [Medium](https://medium.com/@dataengineeringlatam)
36 | - [Twitter](https://twitter.com/DataEngiLatam)
37 | - [Instagram](https://instagram.com/dataengineeringlatam)
38 | - [Facebook](https://facebook.com/dataengineeringlatam)
39 | - [TikTok](https://www.tiktok.com/@dataengineeringlatam)
40 | - [Slack](https://bit.ly/dataengineeringlatam_slack)
41 | - [Telegram](https://t.me/dataengineeringlatam)
42 | - [Linkedin](https://linkedin.com/company/data-engineering-latam)
43 |
44 | ## Want to give a talk in the community?
45 |
46 | :microphone: Tell us [here](https://docs.google.com/forms/d/e/1FAIpQLSd7CZgRxGHx-rRA7CyAeB0MxNPgVj5rCqQsrjrFiNYhoZxS1w/viewform)
47 |
48 | ## Disclaimer
49 |
50 | This is not a course, the speakers are not teachers, and you are not a student. We are all gathered here because we are passionate about this field. If a speaker proposes exercises to solve, you are not required to submit them (nor are we required to grade them =)
51 |
52 | Any feedback you have, always respectful, is welcome!
53 |
54 | ## How to get the most out of this initiative?
55 |
56 | We recommend sharing your summary as slides, a Notion page, a Canva design, a Medium article, a social media post, or all of the above, using the hashtag #dataengineeringlatam and tagging us.
57 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # Spark - The Definite Guide
4 |
5 | This repository contains the materials from the Apache Spark sessions run by the Data Engineering LATAM community.
6 |
7 | ## Index
8 |
9 | #### Part II. Structured APIs - DataFrames, SQL, and Datasets
10 | * [Session 5 - Basic structured operations](sessions/session_5)
11 | * [Session 6 - Working with different types of data](sessions/session_6)
12 | * [Session 7 - Aggregations](sessions/session_7)
13 | * [Session 8 - Joins](sessions/session_8)
14 | * [Session 9 - Data Sources and Spark SQL](sessions/session_9)
15 |
16 | #### Part III. Low-Level APIs
17 | * [Session 10 - Datasets, RDDs, and Distributed Shared Variables](sessions/session_10)
18 |
19 | #### Part IV. Production Applications
20 | * [Session 11 - Production Applications (part 1)](sessions/session_11)
21 | * [Session 12 - Production Applications (part 2) with Spark on Kubernetes](sessions/session_12)
22 |
23 | #### Part V. Streaming
24 | * [Session 13 - Structured Streaming Fundamentals](sessions/session_13)
25 | * [Session 14 - Event-Time and Stateful Processing](sessions/session_14)
26 |
27 | #### Part VI. Advanced Analytics and Machine Learning
28 | * [Session 15 - Introduction to Advanced Analytics and Machine Learning with Spark ML](sessions/session_15)
29 | * [Session 16 - Text preprocessing and feature extraction with Spark ML](sessions/session_16)
30 | * [Session 17 - Preprocessing continuous and categorical data with Spark ML](sessions/session_17)
31 | * [Session 18 - Feature selection and manipulation with Spark ML](sessions/session_18)
32 | * [Session 19 - Regression models with Spark Machine Learning](sessions/session_19)
33 | * [Session 20 - Classification models and hyperparameter tuning with Spark ML](sessions/session_20)
34 | * [Session 21 - Recommendation models with Spark Machine Learning](sessions/session_21)
35 | * [Session 22 - Unsupervised learning models with Spark Machine Learning](sessions/session_22)
36 | * [Session 23 - Graph Analytics with GraphX](sessions/session_23)
37 | * [Session 24 - Deep Learning on Apache Spark](sessions/session_24)
38 |
39 |
40 | ## About "Spark - The Definite Guide"
41 | **Spark: The Definite Guide** is a study group created to learn Apache Spark from scratch. It is based on the book *Spark: The Definitive Guide*, co-written by the original creator of Apache Spark, Matei Zaharia, and by Bill Chambers, both working at Databricks. The book is aimed primarily at data scientists and data engineers who want to take their first steps with Apache Spark.
42 |
43 | The Data Engineering LATAM Spark Study Club has taken the initiative of running study sessions on the book so that its members can become familiar with the tool. These sessions were recorded, and the materials used during the sessions can be found in this repository.
44 |
45 | * Link to buy the book on Amazon: [Spark: The Definitive Guide: Big Data Processing Made Simple](https://www.amazon.com/Spark-Definitive-Guide-Processing-Simple/dp/1491912219)
46 | * Official repository: https://github.com/databricks/Spark-The-Definitive-Guide
47 |
48 | ## About the Data Engineering LATAM community
49 | Data Engineering LATAM is the largest data community in Latin America, whose mission is to promote the region's talent through talks, workshops, study groups, collaborative help, and the creation of relevant content.
50 |
51 | ## Our social networks
52 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
53 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
54 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
55 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
56 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_main.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_10.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_11.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_12.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_13.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_14.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_15.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_16.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_17.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_18.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_19.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_20.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_21.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_22.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_23.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_24.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_5.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_6.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_7.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_8.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/banner_session_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/banner_session_9.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/assets/youtube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/assets/youtube.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/data/2015-summary.json:
--------------------------------------------------------------------------------
1 | {"ORIGIN_COUNTRY_NAME":"Romania","DEST_COUNTRY_NAME":"United States","count":15}
2 | {"ORIGIN_COUNTRY_NAME":"Croatia","DEST_COUNTRY_NAME":"United States","count":1}
3 | {"ORIGIN_COUNTRY_NAME":"Ireland","DEST_COUNTRY_NAME":"United States","count":344}
4 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Egypt","count":15}
5 | {"ORIGIN_COUNTRY_NAME":"India","DEST_COUNTRY_NAME":"United States","count":62}
6 | {"ORIGIN_COUNTRY_NAME":"Singapore","DEST_COUNTRY_NAME":"United States","count":1}
7 | {"ORIGIN_COUNTRY_NAME":"Grenada","DEST_COUNTRY_NAME":"United States","count":62}
8 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Costa Rica","count":588}
9 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Senegal","count":40}
10 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Moldova","count":1}
11 | {"ORIGIN_COUNTRY_NAME":"Sint Maarten","DEST_COUNTRY_NAME":"United States","count":325}
12 | {"ORIGIN_COUNTRY_NAME":"Marshall Islands","DEST_COUNTRY_NAME":"United States","count":39}
13 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Guyana","count":64}
14 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Malta","count":1}
15 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Anguilla","count":41}
16 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Bolivia","count":30}
17 | {"ORIGIN_COUNTRY_NAME":"Paraguay","DEST_COUNTRY_NAME":"United States","count":6}
18 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Algeria","count":4}
19 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Turks and Caicos Islands","count":230}
20 | {"ORIGIN_COUNTRY_NAME":"Gibraltar","DEST_COUNTRY_NAME":"United States","count":1}
21 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Saint Vincent and the Grenadines","count":1}
22 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Italy","count":382}
23 | {"ORIGIN_COUNTRY_NAME":"Federated States of Micronesia","DEST_COUNTRY_NAME":"United States","count":69}
24 | {"ORIGIN_COUNTRY_NAME":"Russia","DEST_COUNTRY_NAME":"United States","count":161}
25 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Pakistan","count":12}
26 | {"ORIGIN_COUNTRY_NAME":"Netherlands","DEST_COUNTRY_NAME":"United States","count":660}
27 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Iceland","count":181}
28 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Marshall Islands","count":42}
29 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Luxembourg","count":155}
30 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Honduras","count":362}
31 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"The Bahamas","count":955}
32 | {"ORIGIN_COUNTRY_NAME":"Senegal","DEST_COUNTRY_NAME":"United States","count":42}
33 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"El Salvador","count":561}
34 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Samoa","count":25}
35 | {"ORIGIN_COUNTRY_NAME":"Angola","DEST_COUNTRY_NAME":"United States","count":13}
36 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Switzerland","count":294}
37 | {"ORIGIN_COUNTRY_NAME":"Anguilla","DEST_COUNTRY_NAME":"United States","count":38}
38 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Sint Maarten","count":325}
39 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Hong Kong","count":332}
40 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Trinidad and Tobago","count":211}
41 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Latvia","count":19}
42 | {"ORIGIN_COUNTRY_NAME":"Ecuador","DEST_COUNTRY_NAME":"United States","count":300}
43 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Suriname","count":1}
44 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Mexico","count":7140}
45 | {"ORIGIN_COUNTRY_NAME":"Cyprus","DEST_COUNTRY_NAME":"United States","count":1}
46 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Ecuador","count":268}
47 | {"ORIGIN_COUNTRY_NAME":"Portugal","DEST_COUNTRY_NAME":"United States","count":134}
48 | {"ORIGIN_COUNTRY_NAME":"Costa Rica","DEST_COUNTRY_NAME":"United States","count":608}
49 | {"ORIGIN_COUNTRY_NAME":"Guatemala","DEST_COUNTRY_NAME":"United States","count":318}
50 | {"ORIGIN_COUNTRY_NAME":"Suriname","DEST_COUNTRY_NAME":"United States","count":34}
51 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Colombia","count":873}
52 | {"ORIGIN_COUNTRY_NAME":"Cape Verde","DEST_COUNTRY_NAME":"United States","count":14}
53 | {"ORIGIN_COUNTRY_NAME":"Jamaica","DEST_COUNTRY_NAME":"United States","count":712}
54 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Norway","count":121}
55 | {"ORIGIN_COUNTRY_NAME":"Malaysia","DEST_COUNTRY_NAME":"United States","count":3}
56 | {"ORIGIN_COUNTRY_NAME":"Morocco","DEST_COUNTRY_NAME":"United States","count":19}
57 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Thailand","count":3}
58 | {"ORIGIN_COUNTRY_NAME":"Samoa","DEST_COUNTRY_NAME":"United States","count":25}
59 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Venezuela","count":290}
60 | {"ORIGIN_COUNTRY_NAME":"Palau","DEST_COUNTRY_NAME":"United States","count":31}
61 | {"ORIGIN_COUNTRY_NAME":"Venezuela","DEST_COUNTRY_NAME":"United States","count":246}
62 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Panama","count":510}
63 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Antigua and Barbuda","count":126}
64 | {"ORIGIN_COUNTRY_NAME":"Chile","DEST_COUNTRY_NAME":"United States","count":185}
65 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Morocco","count":15}
66 | {"ORIGIN_COUNTRY_NAME":"Finland","DEST_COUNTRY_NAME":"United States","count":28}
67 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Azerbaijan","count":21}
68 | {"ORIGIN_COUNTRY_NAME":"Greece","DEST_COUNTRY_NAME":"United States","count":23}
69 | {"ORIGIN_COUNTRY_NAME":"The Bahamas","DEST_COUNTRY_NAME":"United States","count":986}
70 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"New Zealand","count":111}
71 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Liberia","count":2}
72 | {"ORIGIN_COUNTRY_NAME":"Hong Kong","DEST_COUNTRY_NAME":"United States","count":414}
73 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Hungary","count":2}
74 | {"ORIGIN_COUNTRY_NAME":"China","DEST_COUNTRY_NAME":"United States","count":920}
75 | {"ORIGIN_COUNTRY_NAME":"Vietnam","DEST_COUNTRY_NAME":"United States","count":2}
76 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Burkina Faso","count":1}
77 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Sweden","count":118}
78 | {"ORIGIN_COUNTRY_NAME":"Kuwait","DEST_COUNTRY_NAME":"United States","count":28}
79 | {"ORIGIN_COUNTRY_NAME":"Dominican Republic","DEST_COUNTRY_NAME":"United States","count":1420}
80 | {"ORIGIN_COUNTRY_NAME":"Egypt","DEST_COUNTRY_NAME":"United States","count":12}
81 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Israel","count":134}
82 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"United States","count":370002}
83 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Ethiopia","count":13}
84 | {"ORIGIN_COUNTRY_NAME":"Luxembourg","DEST_COUNTRY_NAME":"United States","count":134}
85 | {"ORIGIN_COUNTRY_NAME":"Poland","DEST_COUNTRY_NAME":"United States","count":33}
86 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Martinique","count":44}
87 | {"ORIGIN_COUNTRY_NAME":"Saint Barthelemy","DEST_COUNTRY_NAME":"United States","count":41}
88 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Saint Barthelemy","count":39}
89 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Barbados","count":154}
90 | {"ORIGIN_COUNTRY_NAME":"Turkey","DEST_COUNTRY_NAME":"United States","count":129}
91 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Djibouti","count":1}
92 | {"ORIGIN_COUNTRY_NAME":"Azerbaijan","DEST_COUNTRY_NAME":"United States","count":21}
93 | {"ORIGIN_COUNTRY_NAME":"Estonia","DEST_COUNTRY_NAME":"United States","count":1}
94 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Germany","count":1468}
95 | {"ORIGIN_COUNTRY_NAME":"South Korea","DEST_COUNTRY_NAME":"United States","count":827}
96 | {"ORIGIN_COUNTRY_NAME":"El Salvador","DEST_COUNTRY_NAME":"United States","count":508}
97 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Ireland","count":335}
98 | {"ORIGIN_COUNTRY_NAME":"Hungary","DEST_COUNTRY_NAME":"United States","count":3}
99 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Zambia","count":1}
100 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Malaysia","count":2}
101 | {"ORIGIN_COUNTRY_NAME":"Ethiopia","DEST_COUNTRY_NAME":"United States","count":12}
102 | {"ORIGIN_COUNTRY_NAME":"Panama","DEST_COUNTRY_NAME":"United States","count":465}
103 | {"ORIGIN_COUNTRY_NAME":"Aruba","DEST_COUNTRY_NAME":"United States","count":342}
104 | {"ORIGIN_COUNTRY_NAME":"Thailand","DEST_COUNTRY_NAME":"United States","count":4}
105 | {"ORIGIN_COUNTRY_NAME":"Turks and Caicos Islands","DEST_COUNTRY_NAME":"United States","count":236}
106 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Croatia","count":2}
107 | {"ORIGIN_COUNTRY_NAME":"Pakistan","DEST_COUNTRY_NAME":"United States","count":12}
108 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Cyprus","count":1}
109 | {"ORIGIN_COUNTRY_NAME":"Honduras","DEST_COUNTRY_NAME":"United States","count":407}
110 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Fiji","count":24}
111 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Qatar","count":108}
112 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Saint Kitts and Nevis","count":139}
113 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Kuwait","count":32}
114 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Taiwan","count":266}
115 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Haiti","count":226}
116 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Canada","count":8399}
117 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Federated States of Micronesia","count":69}
118 | {"ORIGIN_COUNTRY_NAME":"Liberia","DEST_COUNTRY_NAME":"United States","count":2}
119 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Jamaica","count":666}
120 | {"ORIGIN_COUNTRY_NAME":"Malta","DEST_COUNTRY_NAME":"United States","count":2}
121 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Dominican Republic","count":1353}
122 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Japan","count":1548}
123 | {"ORIGIN_COUNTRY_NAME":"Lithuania","DEST_COUNTRY_NAME":"United States","count":1}
124 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Finland","count":26}
125 | {"ORIGIN_COUNTRY_NAME":"Guadeloupe","DEST_COUNTRY_NAME":"United States","count":59}
126 | {"ORIGIN_COUNTRY_NAME":"Ukraine","DEST_COUNTRY_NAME":"United States","count":13}
127 | {"ORIGIN_COUNTRY_NAME":"France","DEST_COUNTRY_NAME":"United States","count":952}
128 | {"ORIGIN_COUNTRY_NAME":"Norway","DEST_COUNTRY_NAME":"United States","count":115}
129 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Aruba","count":346}
130 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"French Guiana","count":5}
131 | {"ORIGIN_COUNTRY_NAME":"Kiribati","DEST_COUNTRY_NAME":"United States","count":35}
132 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"India","count":61}
133 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"British Virgin Islands","count":107}
134 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Brazil","count":853}
135 | {"ORIGIN_COUNTRY_NAME":"Germany","DEST_COUNTRY_NAME":"United States","count":1336}
136 | {"ORIGIN_COUNTRY_NAME":"New Zealand","DEST_COUNTRY_NAME":"United States","count":74}
137 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"French Polynesia","count":43}
138 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"United Arab Emirates","count":320}
139 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Singapore","count":3}
140 | {"ORIGIN_COUNTRY_NAME":"Mexico","DEST_COUNTRY_NAME":"United States","count":7187}
141 | {"ORIGIN_COUNTRY_NAME":"Sweden","DEST_COUNTRY_NAME":"United States","count":119}
142 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Netherlands","count":776}
143 | {"ORIGIN_COUNTRY_NAME":"Martinique","DEST_COUNTRY_NAME":"United States","count":43}
144 | {"ORIGIN_COUNTRY_NAME":"United Arab Emirates","DEST_COUNTRY_NAME":"United States","count":313}
145 | {"ORIGIN_COUNTRY_NAME":"Bulgaria","DEST_COUNTRY_NAME":"United States","count":1}
146 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Denmark","count":153}
147 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"China","count":772}
148 | {"ORIGIN_COUNTRY_NAME":"Nicaragua","DEST_COUNTRY_NAME":"United States","count":201}
149 | {"ORIGIN_COUNTRY_NAME":"Philippines","DEST_COUNTRY_NAME":"United States","count":126}
150 | {"ORIGIN_COUNTRY_NAME":"Georgia","DEST_COUNTRY_NAME":"United States","count":1}
151 | {"ORIGIN_COUNTRY_NAME":"Belgium","DEST_COUNTRY_NAME":"United States","count":228}
152 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Cayman Islands","count":314}
153 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Argentina","count":180}
154 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Peru","count":279}
155 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"South Africa","count":36}
156 | {"ORIGIN_COUNTRY_NAME":"Iceland","DEST_COUNTRY_NAME":"United States","count":202}
157 | {"ORIGIN_COUNTRY_NAME":"Argentina","DEST_COUNTRY_NAME":"United States","count":141}
158 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Spain","count":420}
159 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Bermuda","count":183}
160 | {"ORIGIN_COUNTRY_NAME":"Nigeria","DEST_COUNTRY_NAME":"United States","count":50}
161 | {"ORIGIN_COUNTRY_NAME":"Austria","DEST_COUNTRY_NAME":"United States","count":63}
162 | {"ORIGIN_COUNTRY_NAME":"Bonaire, Sint Eustatius, and Saba","DEST_COUNTRY_NAME":"United States","count":59}
163 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Kiribati","count":26}
164 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Saudi Arabia","count":83}
165 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Czech Republic","count":13}
166 | {"ORIGIN_COUNTRY_NAME":"Israel","DEST_COUNTRY_NAME":"United States","count":127}
167 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Belgium","count":259}
168 | {"ORIGIN_COUNTRY_NAME":"Saint Lucia","DEST_COUNTRY_NAME":"United States","count":136}
169 | {"ORIGIN_COUNTRY_NAME":"Bahrain","DEST_COUNTRY_NAME":"United States","count":1}
170 | {"ORIGIN_COUNTRY_NAME":"British Virgin Islands","DEST_COUNTRY_NAME":"United States","count":80}
171 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Curacao","count":90}
172 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Georgia","count":2}
173 | {"ORIGIN_COUNTRY_NAME":"Denmark","DEST_COUNTRY_NAME":"United States","count":152}
174 | {"ORIGIN_COUNTRY_NAME":"Guyana","DEST_COUNTRY_NAME":"United States","count":63}
175 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Philippines","count":134}
176 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Grenada","count":53}
177 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Cape Verde","count":20}
178 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Cote d'Ivoire","count":1}
179 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Ukraine","count":14}
180 | {"ORIGIN_COUNTRY_NAME":"Papua New Guinea","DEST_COUNTRY_NAME":"United States","count":1}
181 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Russia","count":176}
182 | {"ORIGIN_COUNTRY_NAME":"Saudi Arabia","DEST_COUNTRY_NAME":"United States","count":70}
183 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Guatemala","count":397}
184 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Saint Lucia","count":123}
185 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Paraguay","count":60}
186 | {"ORIGIN_COUNTRY_NAME":"Curacao","DEST_COUNTRY_NAME":"United States","count":83}
187 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Kosovo","count":1}
188 | {"ORIGIN_COUNTRY_NAME":"Taiwan","DEST_COUNTRY_NAME":"United States","count":235}
189 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Tunisia","count":3}
190 | {"ORIGIN_COUNTRY_NAME":"South Africa","DEST_COUNTRY_NAME":"United States","count":40}
191 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Niger","count":2}
192 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Turkey","count":138}
193 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"United Kingdom","count":2025}
194 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Romania","count":14}
195 | {"ORIGIN_COUNTRY_NAME":"Greenland","DEST_COUNTRY_NAME":"United States","count":4}
196 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Papua New Guinea","count":3}
197 | {"ORIGIN_COUNTRY_NAME":"Spain","DEST_COUNTRY_NAME":"United States","count":442}
198 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Iraq","count":1}
199 | {"ORIGIN_COUNTRY_NAME":"Italy","DEST_COUNTRY_NAME":"United States","count":438}
200 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Cuba","count":466}
201 | {"ORIGIN_COUNTRY_NAME":"Switzerland","DEST_COUNTRY_NAME":"United States","count":305}
202 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Dominica","count":20}
203 | {"ORIGIN_COUNTRY_NAME":"Japan","DEST_COUNTRY_NAME":"United States","count":1496}
204 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Portugal","count":127}
205 | {"ORIGIN_COUNTRY_NAME":"Brazil","DEST_COUNTRY_NAME":"United States","count":619}
206 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Bahrain","count":19}
207 | {"ORIGIN_COUNTRY_NAME":"Peru","DEST_COUNTRY_NAME":"United States","count":337}
208 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Indonesia","count":1}
209 | {"ORIGIN_COUNTRY_NAME":"Belize","DEST_COUNTRY_NAME":"United States","count":193}
210 | {"ORIGIN_COUNTRY_NAME":"United Kingdom","DEST_COUNTRY_NAME":"United States","count":1970}
211 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Belize","count":188}
212 | {"ORIGIN_COUNTRY_NAME":"Ghana","DEST_COUNTRY_NAME":"United States","count":20}
213 | {"ORIGIN_COUNTRY_NAME":"Indonesia","DEST_COUNTRY_NAME":"United States","count":2}
214 | {"ORIGIN_COUNTRY_NAME":"Fiji","DEST_COUNTRY_NAME":"United States","count":25}
215 | {"ORIGIN_COUNTRY_NAME":"Canada","DEST_COUNTRY_NAME":"United States","count":8483}
216 | {"ORIGIN_COUNTRY_NAME":"Antigua and Barbuda","DEST_COUNTRY_NAME":"United States","count":117}
217 | {"ORIGIN_COUNTRY_NAME":"French Polynesia","DEST_COUNTRY_NAME":"United States","count":40}
218 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Nicaragua","count":179}
219 | {"ORIGIN_COUNTRY_NAME":"Latvia","DEST_COUNTRY_NAME":"United States","count":15}
220 | {"ORIGIN_COUNTRY_NAME":"Dominica","DEST_COUNTRY_NAME":"United States","count":27}
221 | {"ORIGIN_COUNTRY_NAME":"Czech Republic","DEST_COUNTRY_NAME":"United States","count":12}
222 | {"ORIGIN_COUNTRY_NAME":"Australia","DEST_COUNTRY_NAME":"United States","count":258}
223 | {"ORIGIN_COUNTRY_NAME":"Cook Islands","DEST_COUNTRY_NAME":"United States","count":13}
224 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Austria","count":62}
225 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Jordan","count":44}
226 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Palau","count":30}
227 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"South Korea","count":1048}
228 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Angola","count":15}
229 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Ghana","count":18}
230 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"New Caledonia","count":1}
231 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Guadeloupe","count":56}
232 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"France","count":935}
233 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Poland","count":32}
234 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Nigeria","count":59}
235 | {"ORIGIN_COUNTRY_NAME":"Uruguay","DEST_COUNTRY_NAME":"United States","count":13}
236 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Greenland","count":2}
237 | {"ORIGIN_COUNTRY_NAME":"Bermuda","DEST_COUNTRY_NAME":"United States","count":193}
238 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Chile","count":174}
239 | {"ORIGIN_COUNTRY_NAME":"Cuba","DEST_COUNTRY_NAME":"United States","count":478}
240 | {"ORIGIN_COUNTRY_NAME":"Montenegro","DEST_COUNTRY_NAME":"United States","count":1}
241 | {"ORIGIN_COUNTRY_NAME":"Colombia","DEST_COUNTRY_NAME":"United States","count":867}
242 | {"ORIGIN_COUNTRY_NAME":"Barbados","DEST_COUNTRY_NAME":"United States","count":130}
243 | {"ORIGIN_COUNTRY_NAME":"Qatar","DEST_COUNTRY_NAME":"United States","count":109}
244 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Australia","count":329}
245 | {"ORIGIN_COUNTRY_NAME":"Cayman Islands","DEST_COUNTRY_NAME":"United States","count":310}
246 | {"ORIGIN_COUNTRY_NAME":"Jordan","DEST_COUNTRY_NAME":"United States","count":44}
247 | {"ORIGIN_COUNTRY_NAME":"Namibia","DEST_COUNTRY_NAME":"United States","count":1}
248 | {"ORIGIN_COUNTRY_NAME":"Trinidad and Tobago","DEST_COUNTRY_NAME":"United States","count":217}
249 | {"ORIGIN_COUNTRY_NAME":"Bolivia","DEST_COUNTRY_NAME":"United States","count":13}
250 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Cook Islands","count":13}
251 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Bulgaria","count":3}
252 | {"ORIGIN_COUNTRY_NAME":"Saint Kitts and Nevis","DEST_COUNTRY_NAME":"United States","count":145}
253 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Uruguay","count":43}
254 | {"ORIGIN_COUNTRY_NAME":"Haiti","DEST_COUNTRY_NAME":"United States","count":225}
255 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Bonaire, Sint Eustatius, and Saba","count":58}
256 | {"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Greece","count":30}
257 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/data/customers.csv:
--------------------------------------------------------------------------------
1 | Id,CustomerType,FirstName,MiddleName,LastName
2 | 12,Diamond,Elena,,Moore
3 | 13,Bronze,Paul,Williams,Page
4 | 14,Bronze,Albert,,Thomas
5 | 15,Bronze,Sandra,Elizabeth,Faith
6 | 16,Gold,Robert,,Alexander
7 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/data/customers.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE customers (Id INT PRIMARY KEY, CustomerType VARCHAR(10), FirstName VARCHAR(20), MiddleName VARCHAR(20), LastName VARCHAR(20));
2 | INSERT INTO customers VALUES
3 | (12,'Diamond','Elena',NULL,'Moore'),
4 | (13,'Bronze','Paul','Williams','Page'),
5 | (14,'Bronze','Albert',NULL,'Thomas'),
6 | (15,'Bronze','Sandra','Elizabeth','Faith'),
7 | (16,'Gold','Robert',NULL,'Alexander');
--------------------------------------------------------------------------------
/Spark - The Definite Guide/data/secret.txt:
--------------------------------------------------------------------------------
1 | root
2 | secret
--------------------------------------------------------------------------------
/Spark - The Definite Guide/data/simple-ml/DatosNuevos.json:
--------------------------------------------------------------------------------
1 | {"color":"green","value1":9, "value2":10.433, "lab":"good"}
2 | {"color":"blue","value1":8, "value2":17.3423, "lab":"bad"}
3 | {"color":"red","value1":40, "value2":11.876, "lab":"good"}
4 | {"color":"red","value1":7, "value2":7.6, "lab":"bad"}
5 | {"color":"blue","value1":2, "value2":8.213, "lab":"good"}
--------------------------------------------------------------------------------
/Spark - The Definite Guide/data/simple-ml/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/data/simple-ml/_SUCCESS
--------------------------------------------------------------------------------
/Spark - The Definite Guide/data/test/part-r-00000-f5c243b9-a015-4a3b-a4a8-eca00f80f04c.json:
--------------------------------------------------------------------------------
1 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
2 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
3 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
4 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
5 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
6 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
7 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
8 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
9 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
10 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
11 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
12 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
13 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
14 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
15 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
16 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
17 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
18 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
19 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
20 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
21 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
22 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
23 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
24 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
25 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
26 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
27 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
28 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
29 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
30 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
31 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
32 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
33 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
34 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
35 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
36 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
37 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
38 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
39 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
40 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
41 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
42 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
43 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
44 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
45 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
46 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
47 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
48 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
49 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
50 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
51 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
52 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
53 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
54 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
55 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
56 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
57 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
58 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
59 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
60 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
61 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
62 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
63 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
64 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
65 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
66 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
67 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
68 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
69 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
70 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
71 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
72 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
73 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
74 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
75 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
76 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
77 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
78 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
79 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
80 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
81 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
82 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
83 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
84 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
85 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
86 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
87 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
88 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
89 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
90 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
91 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
92 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
93 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
94 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
95 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
96 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
97 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
98 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
99 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
100 | {"lab":"good","color":"green","value1":1,"value2":14.386294994851129}
101 | {"lab":"bad","color":"blue","value1":8,"value2":14.386294994851129}
102 | {"lab":"bad","color":"blue","value1":12,"value2":14.386294994851129}
103 | {"lab":"good","color":"green","value1":15,"value2":38.97187133755819}
104 | {"lab":"good","color":"green","value1":12,"value2":14.386294994851129}
105 | {"lab":"bad","color":"green","value1":16,"value2":14.386294994851129}
106 | {"lab":"good","color":"red","value1":35,"value2":14.386294994851129}
107 | {"lab":"bad","color":"red","value1":1,"value2":38.97187133755819}
108 | {"lab":"bad","color":"red","value1":2,"value2":14.386294994851129}
109 | {"lab":"bad","color":"red","value1":16,"value2":14.386294994851129}
110 | {"lab":"good","color":"red","value1":45,"value2":38.97187133755819}
111 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/data/transactions.csv:
--------------------------------------------------------------------------------
1 | TransactionId,CustomerID,Merchant,Product,TotalAmount,Date
2 | 1,14,amazon.com.uk,[shirt,shoes],150,2021-04-08
3 | 2,12,marksandspencer.com,[short,shirt],50,2021-04-08
4 | 3,14,amazon.com.uk,[smartphone,charger],450,2021-04-09
5 | 4,12,tesco.com,[fruits,meat,wholegrains],75,2021-04-09
6 | 5,13,apple.com.uk,[charger,headphone],120,2021-04-09
7 | 6,15,e.leclerc,[smartphone],550,2021-04-10
8 | 7,14,zalando.com,[shoes],45,2021-04-11
9 | 8,13,zalando.com,[handbag,jumpsuit],250,2021-04-13
10 | 9,15,amazon.com,[books],50,2021-04-13
11 | 10,12,amazon.com,[necklaces,boots],350,2021-04-13
12 | 11,14,amazon.com.uk,[desktop],850,2021-04-14
13 | 12,15,e.leclerc,[smartphone],450,2021-04-14
14 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_10/README.md:
--------------------------------------------------------------------------------
1 | ## Session 10
2 | ### Chapters 11,12,13 & 14 - Datasets and Resilient Distributed Datasets (RDDs)
3 |
4 | 
5 |
6 | ### Summary
7 | In this session we got to know the Dataset API before diving deeper into Apache Spark's core data structure: the Resilient Distributed Dataset (RDD).
8 |
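9 | As a minimal, hedged illustration (not part of the session materials), an RDD can be created, transformed, and aggregated in PySpark as follows, assuming an active `SparkSession` named `spark`:
10 |
11 | ```python
12 | # Distribute a local collection as an RDD, then apply a transformation and an action
13 | rdd = spark.sparkContext.parallelize([1, 2, 3, 4])
14 | print(rdd.map(lambda x: x * 2).reduce(lambda a, b: a + b))  # prints 20
15 | ```
16 |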
17 | #### Session recording
18 |
19 | [](https://www.youtube.com/watch?v=oND_jxWRtjo)
20 |
21 | #### Our social media
22 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
23 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
24 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
25 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
26 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_11/README.md:
--------------------------------------------------------------------------------
1 | ## Session 11
2 | ### Part IV - Production Applications (1/2)
3 |
4 | 
5 |
6 | *Note: this is the first of two sessions on Part IV. Production Applications.*
7 |
8 | ### Summary
9 | So far, we have focused on Spark's properties as a programming interface. In this session, we focus on the topics relevant to deploying Spark applications: the architecture, components, and life cycle of an application, the steps needed to run it, how to develop it, and the options Spark offers for deploying it.
10 |
11 | #### Session recording
12 |
13 | [](https://www.youtube.com/watch?v=FbwINmuBCrw)
14 |
15 | #### Our social media
16 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
17 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
18 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
19 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
20 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_12/README.md:
--------------------------------------------------------------------------------
1 | ## Session 12
2 | ### Part IV - Production Applications (2/2) with Spark on Kubernetes
3 |
4 | 
5 |
6 | *Note: this is the second of two sessions on Part IV. Production Applications.*
7 |
8 | ### Summary
9 | In this session, we focus on the topics relevant to deploying Spark applications on a Kubernetes cluster. Kubernetes was added as a resource manager in Spark 2.3, although it was only marked GA (Generally Available) and production-ready in Spark 3.1. We will cover the architecture of a Kubernetes cluster, the options Spark offers, and the steps needed to deploy an application on Kubernetes.
10 |
11 | Among the relevant topics of the session were:
12 |
13 | * The architecture of a Kubernetes cluster
14 | * Why Spark on Kubernetes?
15 | * Deploying an application with *spark-submit* (see the sketch below)
16 | * Deploying an application with a Spark Operator
17 | * Some of the monitoring and debugging options Spark offers on Kubernetes
18 |
19 | An article detailing the steps carried out during this session was written and can be found [here](https://www.kauvinlucas.com/projects/deploying-and-monitoring-spark-applications-with-kubernetes/es.html).
20 |
21 | An application was deployed on a Kubernetes cluster using the files from this [Github repository](https://github.com/kauvinlucas/spark-kubernetes).
22 |
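23 | As a hedged illustration of the *spark-submit* route (the API server address, container image, and example jar path are placeholders, not values from the session), a cluster-mode submission to Kubernetes looks roughly like this:
24 |
25 | ```bash
26 | ./bin/spark-submit \
27 |   --master k8s://https://<kubernetes-api-server>:443 \
28 |   --deploy-mode cluster \
29 |   --name spark-pi \
30 |   --class org.apache.spark.examples.SparkPi \
31 |   --conf spark.executor.instances=2 \
32 |   --conf spark.kubernetes.container.image=<your-spark-image> \
33 |   local:///opt/spark/examples/jars/spark-examples_2.12-3.1.1.jar
34 | ```
35 |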
36 | #### Session recording
37 | [](https://www.youtube.com/watch?v=6_aVEcGob98)
38 |
39 |
40 | #### Our social media
41 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
42 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
43 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
44 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
45 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_13/README.md:
--------------------------------------------------------------------------------
1 | ## Session 13
2 | ### Chapters 20 & 21 - Structured Streaming Fundamentals
3 |
4 | 
5 |
6 | ### Summary
7 | In this session, we look at the advantages of stream processing and introduce Structured Streaming in Apache Spark. Stream processing is key to many Big Data applications, and Apache Spark has a long history of streaming support. The Structured Streaming API is Apache Spark's most recent addition in this area, integrating the ease of use and query optimizations of Spark SQL.
8 |
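9 | As a minimal, hedged sketch of the API (the input directory is an assumption for illustration, not session code), a streaming aggregation over JSON files shaped like this repository's flight data could look like this:
10 |
11 | ```python
12 | from pyspark.sql import SparkSession
13 |
14 | spark = SparkSession.builder.appName("streaming-sketch").getOrCreate()
15 |
16 | # Treat new JSON files dropped into a directory as an unbounded input table
17 | stream = (spark.readStream
18 |     .schema("DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG")
19 |     .json("/tmp/flight-data/"))  # hypothetical input directory
20 |
21 | # Incrementally maintain the running totals per destination
22 | counts = stream.groupBy("DEST_COUNTRY_NAME").sum("count")
23 |
24 | query = (counts.writeStream
25 |     .outputMode("complete")
26 |     .format("console")
27 |     .start())
28 | ```
29 |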
30 | #### Session recording
31 | [](https://www.youtube.com/watch?v=y4DWXnEIgeM)
32 |
33 | #### Our social media
34 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
35 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
36 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
37 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
38 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_14/README.md:
--------------------------------------------------------------------------------
1 | ## Session 14
2 | ### Chapters 22 & 23 - Event-Time and Stateful Processing
3 |
4 | 
5 |
6 | ### Summary
7 | In the previous session, we covered the relevant aspects of the Structured Streaming API. Now we turn to event-time processing. This processing model analyzes information with respect to the time at which it was created, which has important implications for stream processing, especially in production environments, since the engine must maintain the relevant state to update results before writing them to the sink.
8 |
9 | During the session, a small project was presented that ingests Twitter data into Apache Kafka and reads it as a stream in PySpark. The project repository can be found [here](https://github.com/kauvinlucas/pyspark-stateful-processing-with-twitter-kafka).
10 |
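11 | As a minimal, hedged sketch of event-time processing (using the built-in `rate` source, which emits `timestamp` and `value` columns, so it runs anywhere; this is not the session's Twitter/Kafka code), a windowed count with a watermark looks like this:
12 |
13 | ```python
14 | from pyspark.sql import SparkSession
15 | from pyspark.sql.functions import window, col
16 |
17 | spark = SparkSession.builder.appName("event-time-sketch").getOrCreate()
18 |
19 | events = spark.readStream.format("rate").option("rowsPerSecond", 10).load()
20 |
21 | windowed = (events
22 |     .withWatermark("timestamp", "1 minute")           # drop state for data over 1 minute late
23 |     .groupBy(window(col("timestamp"), "30 seconds"))  # 30-second event-time windows
24 |     .count())
25 |
26 | query = windowed.writeStream.outputMode("update").format("console").start()
27 | ```
28 |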
29 | #### Session recording
30 | [](https://www.youtube.com/watch?v=PAXTLdXDhDk)
31 |
32 | #### Our social media
33 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
34 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
35 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
36 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
37 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_15/README.md:
--------------------------------------------------------------------------------
1 | ## Session 15
2 | ### Chapter 24 - Advanced Analytics and Machine Learning Overview
3 |
4 | 
5 |
6 | ### Summary
7 | Beyond large-scale analysis with Spark SQL and stream processing with Structured Streaming, Spark also supports advanced analytics with machine learning and graph analytics in distributed environments.
8 |
9 | The notebook used in Databricks to demonstrate building a machine learning pipeline with Spark ML can be found in [`Advanced Analytics - Demo Lab.ipynb`](Advanced%20Analytics%20-%20Demo%20Lab.ipynb).
10 |
11 | #### Session recording
12 | [](https://www.youtube.com/watch?v=28T-vSLznsw)
13 |
14 |
15 | #### Our social media
16 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
17 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
18 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
19 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
20 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_16/README.md:
--------------------------------------------------------------------------------
1 | ## Session 16
2 | ### Chapter 25 - Preprocessing and Feature Engineering (1/3)
3 |
4 | 
5 |
6 | *Note: this is the first of three sessions on this chapter.*
7 |
8 | ### Summary
9 | Any self-respecting data scientist or machine learning engineer knows that preparing data for a machine learning model is one of the biggest challenges in predictive analytics. For preprocessing and feature engineering to succeed, the practitioner needs a deep understanding of the problem to be solved, the data to be processed, and what the machine learning model requires in order to take advantage of that data.
10 |
11 | In this session, we focus on preprocessing and feature engineering for text. Text processing is an important discipline in data mining and natural language processing (NLP). Spark ML offers several tools for converting text into a representative numeric vector, and we explore these tools in a simple example of tweet sentiment analysis.
12 |
13 | The notebook used for the analysis can be found in [`Text Preprocessing and Feature Extraction.ipynb`](Text%20Preprocessing%20and%20Feature%20Extraction.ipynb). The data used (`train.csv`) comes from the Kaggle competition `Tweet Sentiment Extraction`. You can access it [here](https://www.kaggle.com/competitions/tweet-sentiment-extraction/data) (a Kaggle account is required, and you must accept the competition's terms).
14 |
15 | #### Session recording
16 | [](https://www.youtube.com/watch?v=T2B0ZJlYOqU)
17 |
18 |
19 | #### Our social media
20 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
21 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
22 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
23 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
24 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_16/Text Preprocessing and Feature Extraction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "dd9de4d6",
6 | "metadata": {},
7 | "source": [
8 | "# Preprocesamiento de texto y extracción de características en PySpark (para el análisis de sentimiento)\n",
9 | "\n",
10 | "En el presente notebook, procesaremos un conjunto de datos compuestos por tweets con el propósito de realizar un análisis de sentimiento con el módulo de regresión logística en Apache Spark. Se utilizará Spark ML y Spark NLP para las etapas de transformación de los datos e entrenamiento y evaluación del modelo de aprendizaje automático."
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "b8551211",
16 | "metadata": {},
17 | "source": [
18 | "## 1. Importar las librerías de Spark ML"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "55d05a8a",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import findspark\n",
29 | "findspark.init('/usr/local/spark') #Especificar la ruta de Apache Spark"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "id": "297f7fc0",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "from pyspark.sql import SparkSession\n",
40 | "import pyspark.sql.functions as f\n",
41 | "import pyspark.sql.types as t\n",
42 | "from pyspark.ml.feature import (\n",
43 | " RegexTokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF, StringIndexer\n",
44 | ")\n",
45 | "from pyspark.ml.classification import LogisticRegression\n",
46 | "from pyspark.ml import Pipeline\n",
47 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "id": "c413a388",
53 | "metadata": {},
54 | "source": [
55 | "## 2. Importar los datos"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "id": "72c7d9cb",
61 | "metadata": {},
62 | "source": [
63 | "Importaremos datos del Twitter en csv conteniendo tweets en la columna `select_text` y clases en la columna `sentiment` que representan el sentimiento de cada tweet (\"negativo\", \"neutral\" o \"positivo\").\n",
64 | "\n",
65 | "Dicho conjunto de datos fue utilizado en la competición **Tweet Sentiment Extraction** de Kaggle."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "08f06d52",
72 | "metadata": {
73 | "scrolled": false
74 | },
75 | "outputs": [],
76 | "source": [
77 | "# Iniciar SparkSession\n",
78 | "spark = SparkSession \\\n",
79 | " .builder \\\n",
80 | " .getOrCreate()\n",
81 | "\n",
82 | "# Leer los datos\n",
83 | "df = spark\\\n",
84 | " .read\\\n",
85 | " .option(\"header\", \"true\")\\\n",
86 | " .option(\"inferSchema\", True)\\\n",
87 | " .csv(\"train.csv\")\\ # Especificar la ruta del archivo train.csv\n",
88 | " .select(\"selected_text\", \"sentiment\")\n",
89 | "\n",
90 | "spark.sparkContext.setLogLevel(\"ERROR\")\n",
91 | "\n",
92 | "# Reducir el número de Shuffle partitions para 5\n",
93 | "spark.conf.set(\"spark.sql.shuffle.partitions\", \"5\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "id": "7980a55b",
99 | "metadata": {},
100 | "source": [
101 | "## 3. Realizar una limpieza en los datos\n",
102 | "\n",
103 | "Antes de preprocesarlos, es necesario que nuestros datos estén limpios. Es imperativo explorar los datos para encontrar errores e imperfecciones que pueden tener un impacto negativo durante la etapa de preprocesamiento.\n",
104 | "\n",
105 | "Es muy importante asegurarnos de que **no hayan valores faltantes o nulos en el conjunto de datos**. La presencia de estos valores es la causa frecuente de problemas cuando trabajamos con Spark ML."
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "5512f460",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "# Quitar los valores faltantes o nulos\n",
116 | "df_cleaned = df\\\n",
117 | " .dropna()\\\n",
118 | " .select(\"selected_text\",\"sentiment\")\n",
119 | "\n",
120 | "# Contar la cantidad de filas del conjunto de datos\n",
121 | "df_cleaned.count()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "id": "96b4d6ef",
127 | "metadata": {},
128 | "source": [
129 | "## 4. Preprocesamiento de los datos con RegexTokenizer y StopWordsRemover\n",
130 | "\n",
131 | "Utilizaremos los siguientes **tranformers** para preprocesar nuestros datos:\n",
132 | "1. RegexTokenizer: transforma el texto en un conjunto de tokens (palabras) aplicando una regular expression (regex); y\n",
133 | "2. StopWordsRemover: remueve los tokens frecuentes de cada texto."
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "id": "fc0e70e3",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "# Extraer las palabras de cada texto mediante una expresión regular (regex)\n",
144 | "regextokenizer = RegexTokenizer(inputCol=\"selected_text\", outputCol=\"words\", pattern=\"\\\\W\")\n",
145 | "\n",
146 | "# Remover las palabras comúnes del texto\n",
147 | "englishStopWords = StopWordsRemover.loadDefaultStopWords(\"english\")\n",
148 | "stops = StopWordsRemover()\\\n",
149 | " .setStopWords(englishStopWords)\\\n",
150 | " .setInputCol(\"words\")\\\n",
151 | " .setOutputCol(\"preprocessed\")\n",
152 | "\n",
153 | "# Construir la pipeline de preprocesamiento\n",
154 | "pipeline = Pipeline(stages=[regextokenizer, stops])"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "id": "6a38f266",
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# Aplicar la pipeline de preprocesamiento\n",
165 | "pipelineFit = pipeline.fit(df_cleaned)\n",
166 | "countvectorizer_transformed = pipelineFit.transform(df_cleaned)\n",
167 | "\n",
168 | "# Remover filas con arrays vacios\n",
169 | "filtered = countvectorizer_transformed.filter(f.size('preprocessed') > 0)\n",
170 | "\n",
171 | "# Seleccionar la variable de entrada y la variable de salida\n",
172 | "preprocessed = filtered.select(\"sentiment\", \"preprocessed\")"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "id": "1c5f5ab1",
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "preprocessed.show(10, False)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "id": "3a912805",
188 | "metadata": {},
189 | "source": [
190 | "## 5. Extracción de características con el CountVectorizer\n",
191 | "\n",
192 | "En un modelo de **TF-IDF**, el CountVectorizer puede ser utilizado para calcular el TF o *Term Frequency*. El TF es un vector que representaría la ocurrencia de cada palabra dentro de cada documento. El IDF intentar asignar pesos a cada elemento del TF. Las palabras que aparecen con mayor frecuencia en los documentos reciben un peso menor en comparación con la palabras menos comúnes en los documentos.\n",
193 | "\n",
194 | "El CountVectorizer es un transformer que hace un recuento de cada palabra en el documento y los expresa en un vector escaso.\n",
195 | "\n",
196 | "También hemos transformado la variable de salida a una representación numérica de las categorías mediante el StringIndexer."
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "id": "926102a0",
202 | "metadata": {},
203 | "source": [
204 | "#### 5.1. Construir la pipeline de extracción"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "id": "94a3aaf6",
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "# Transformar la variable de entrada en vectores de representación mediante el CountVectorizer\n",
215 | "cv = CountVectorizer()\\\n",
216 | " .setInputCol(\"preprocessed\")\\\n",
217 | " .setOutputCol(\"TFOut\")\\\n",
218 | " .setVocabSize(500)\\\n",
219 | " .setMinTF(1)\\\n",
220 | " .setMinDF(2)\n",
221 | "\n",
222 | "# Aplicar el IDF\n",
223 | "idf = IDF()\\\n",
224 | " .setInputCol(\"TFOut\")\\\n",
225 | " .setOutputCol(\"features\")\\\n",
226 | " .setMinDocFreq(2)\n",
227 | "\n",
228 | "# Representar la variable de salida en términos numéricos\n",
229 | "label_stringIdx = StringIndexer(inputCol = \"sentiment\", outputCol = \"label\")\n",
230 | "\n",
231 | "# Construir la pipeline\n",
232 | "pipeline = Pipeline(stages=[cv, idf, label_stringIdx])"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "id": "2462a514",
238 | "metadata": {},
239 | "source": [
240 | "#### 5.2 Aplicar la pipeline de transformación"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "id": "b62b2c25",
247 | "metadata": {
248 | "scrolled": false
249 | },
250 | "outputs": [],
251 | "source": [
252 | "pipelineFit = pipeline.fit(preprocessed)\n",
253 | "countvectorizer_transformed = pipelineFit.transform(preprocessed).select(\"features\", \"label\")\n",
254 | "countvectorizer_transformed.show(10, False)"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "id": "8ee83620",
260 | "metadata": {},
261 | "source": [
262 | "#### 5.3 Crear un modelo de regresión logística y evaluarlo"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "id": "50c2864f",
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "# Hacer el fit y transform\n",
273 | "lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)\n",
274 | "lrModel = lr.fit(countvectorizer_transformed)\n",
275 | "predictions = lrModel.transform(countvectorizer_transformed)\n",
276 | "\n",
277 | "# Hacer el predict\n",
278 | "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\", metricName=\"f1\")\n",
279 | "print(\"La precisión del modelo es del {:0.2f}%\".format(evaluator.evaluate(predictions)*100))"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "id": "a9243f83",
285 | "metadata": {},
286 | "source": [
287 | "## 6. Extracción de características con el HashingTF"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "id": "4796c6fa",
293 | "metadata": {},
294 | "source": [
295 | "#### 6.1. Construir la pipeline de extracción"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "id": "76ed037b",
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "# Transformar la variable de entrada en vectores de representación mediante el HashingTF\n",
306 | "tf = HashingTF()\\\n",
307 | " .setInputCol(\"preprocessed\")\\\n",
308 | " .setOutputCol(\"TFOut\")\\\n",
309 | " .setNumFeatures(10000)\n",
310 | "\n",
311 | "# Aplicar el IDF\n",
312 | "idf = IDF()\\\n",
313 | " .setInputCol(\"TFOut\")\\\n",
314 | " .setOutputCol(\"features\")\\\n",
315 | " .setMinDocFreq(2)\n",
316 | "\n",
317 | "# Representar la variable de salida en términos numéricos\n",
318 | "label_stringIdx = StringIndexer(inputCol = \"sentiment\", outputCol = \"label\")\n",
319 | "\n",
320 | "# Construir la pipeline\n",
321 | "pipeline = Pipeline(stages=[tf, idf, label_stringIdx])"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "id": "28c62453",
327 | "metadata": {},
328 | "source": [
329 | "#### 6.2 Aplicar la pipeline de extracción"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "id": "cce8d356",
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "pipelineFit = pipeline.fit(preprocessed)\n",
340 | "hashingtf_transformed = pipelineFit.transform(preprocessed).select(\"features\", \"label\")\n",
341 | "hashingtf_transformed.show(10, False)"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "id": "da3080df",
347 | "metadata": {},
348 | "source": [
349 | "#### 6.3 Crear un modelo de regresión logística y evaluarlo"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "id": "5f87f434",
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "# Hacer el fit y transform\n",
360 | "lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)\n",
361 | "lrModel = lr.fit(hashingtf_transformed)\n",
362 | "predictions = lrModel.transform(hashingtf_transformed)\n",
363 | "\n",
364 | "# Evaluar con MulticlassClassificationEvaluator\n",
365 | "print(\"La precisión del modelo es del {:0.2f}%\".format(evaluator.evaluate(predictions)*100))"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "id": "3838c4cb",
371 | "metadata": {},
372 | "source": [
373 | "## 7. Extracción de características con el Word2Vec"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "id": "f90ec0de",
379 | "metadata": {},
380 | "source": [
381 | "#### 7.1. Construir y aplicar la pipeline de extracción"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "id": "8a0dc2e5",
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "from pyspark.ml.feature import Word2Vec\n",
392 | "word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol=\"preprocessed\",\n",
393 | "outputCol=\"features\")\n",
394 | "\n",
395 | "# Representar la variable de salida en términos numéricos\n",
396 | "label_stringIdx = StringIndexer(inputCol = \"sentiment\", outputCol = \"label\")\n",
397 | "\n",
398 | "# Aplicar la pipeline de extracción\n",
399 | "pipeline = Pipeline(stages=[word2Vec, label_stringIdx])\n",
400 | "pipelineFit = pipeline.fit(preprocessed)\n",
401 | "word2vec_transformed = pipelineFit.transform(preprocessed).select(\"features\", \"label\")\n",
402 | "word2vec_transformed.show(10, False)"
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "id": "7afa6b89",
408 | "metadata": {},
409 | "source": [
410 | "#### 7.2 Crear un modelo de regresión logística y evaluarlo"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "id": "f90a85e7",
417 | "metadata": {},
418 | "outputs": [],
419 | "source": [
420 | "# Hacer el fit y transform\n",
421 | "lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)\n",
422 | "lrModel = lr.fit(word2vec_transformed)\n",
423 | "predictions = lrModel.transform(word2vec_transformed)\n",
424 | "\n",
425 | "# Evaluar con MulticlassClassificationEvaluator\n",
426 | "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\", metricName=\"f1\")\n",
427 | "print(\"La precisión del modelo es del {:0.2f}%\".format(evaluator.evaluate(predictions)*100))"
428 | ]
429 | },
430 | {
431 | "cell_type": "markdown",
432 | "id": "12bab4ef",
433 | "metadata": {},
434 | "source": [
435 | "## 8 Consideraciones finales"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "id": "fa2d06b3",
441 | "metadata": {},
442 | "source": [
443 | "En este notebook, hemos realizado el preprocesamiento de texto, extracción de características y entrenado un modelo de regresión logística para clasificar el sentimiento de cada tweet, utilizando puramente los módulos de Spark Machine Learning. Con transformaciones sencillas, hemos lograr entrenar un modelo con una precisión del +80% sobre los datos de entrenamiento.\n",
444 | "\n",
445 | "Podríamos mejorar nuestras pipelines de procesamiento con Spark NLP, incorporando a nuestra pipeline otros procesadores como Stemmer y Lemmatizer para normalizar el texto."
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": null,
451 | "id": "abaac500",
452 | "metadata": {},
453 | "outputs": [],
454 | "source": []
455 | }
456 | ],
457 | "metadata": {
458 | "kernelspec": {
459 | "display_name": "Python 3 (ipykernel)",
460 | "language": "python",
461 | "name": "python3"
462 | },
463 | "language_info": {
464 | "codemirror_mode": {
465 | "name": "ipython",
466 | "version": 3
467 | },
468 | "file_extension": ".py",
469 | "mimetype": "text/x-python",
470 | "name": "python",
471 | "nbconvert_exporter": "python",
472 | "pygments_lexer": "ipython3",
473 | "version": "3.10.2"
474 | }
475 | },
476 | "nbformat": 4,
477 | "nbformat_minor": 5
478 | }
479 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_17/README.md:
--------------------------------------------------------------------------------
1 | ## Session 17 - preprocessing continuous and categorical data with Spark ML
2 | ### Chapter 25 - Preprocessing and Feature Engineering (2/3)
3 |
4 | 
5 |
6 | *Note: this is the second of three sessions on this chapter.*
7 |
8 | ### Summary
9 | Any self-respecting data scientist or machine learning engineer knows that preparing data for a machine learning model is one of the biggest challenges in predictive analytics. For preprocessing and feature engineering to succeed, the practitioner needs a deep understanding of the problem to be solved, the data to be processed, and what the machine learning model requires in order to take advantage of that data.
10 |
11 | In this session, we focus on preprocessing and feature engineering for continuous and categorical features. For continuous variables, Spark ML provides transformers for normalization or standardization and for bucketing; for categorical variables, values can be encoded and decoded (see the sketch below).
12 |
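13 | As a small, hedged sketch of those transformers (the toy rows are invented for illustration and this is not the session notebook), assuming an active `SparkSession` named `spark`:
14 |
15 | ```python
16 | from pyspark.ml import Pipeline
17 | from pyspark.ml.feature import Bucketizer, OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler
18 |
19 | df = spark.createDataFrame(
20 |     [(25.0, 1500.0, "Gold"), (41.0, 300.0, "Bronze"), (33.0, 4200.0, "Diamond")],
21 |     ["age", "balance", "CustomerType"])
22 |
23 | assembler = VectorAssembler(inputCols=["age", "balance"], outputCol="continuous")
24 | scaler = StandardScaler(inputCol="continuous", outputCol="scaled")         # standardization
25 | buckets = Bucketizer(splits=[0.0, 30.0, 40.0, float("inf")],
26 |                      inputCol="age", outputCol="age_bucket")               # bucketing
27 | indexer = StringIndexer(inputCol="CustomerType", outputCol="type_index")   # encode categories
28 | encoder = OneHotEncoder(inputCols=["type_index"], outputCols=["type_vec"])
29 |
30 | pipeline = Pipeline(stages=[assembler, scaler, buckets, indexer, encoder])
31 | pipeline.fit(df).transform(df).show(truncate=False)
32 | ```
33 |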
34 | We preprocess continuous and categorical data in two examples:
35 | * Churn analysis of credit card customers, and
36 | * Price volatility prediction for financial options.
37 |
38 | The notebook used for the analysis can be found in [`Feature Transformation.ipynb`](Feature%20Transformation.ipynb). Two datasets were used: **[Credit Card customers](https://www.kaggle.com/datasets/sakshigoyal7/credit-card-customers)** (on Kaggle) and the data from the Kaggle competition **[Optiver Realized Volatility Prediction](https://www.kaggle.com/c/optiver-realized-volatility-prediction/overview)**.
39 |
40 | #### Session recording
41 | [](https://www.youtube.com/watch?v=s_d3fM41dTY)
42 |
43 |
44 | #### Our social media
45 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
46 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
47 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
48 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
49 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_18/README.md:
--------------------------------------------------------------------------------
1 | ## Session 18 - feature selection and manipulation with Spark ML
2 | ### Chapter 25 - Preprocessing and Feature Engineering (3/3)
3 |
4 | 
5 |
6 | *Note: this is the last of three sessions on this chapter.*
7 |
8 | ### Summary
9 | Any self-respecting data scientist or machine learning engineer knows that preparing data for a machine learning model is one of the biggest challenges in predictive analytics. For preprocessing and feature engineering to succeed, the practitioner needs a deep understanding of the problem to be solved, the data to be processed, and what the machine learning model requires in order to take advantage of that data.
10 |
11 | In this session, we demonstrate some of the steps for feature selection and dimensionality reduction with the Spark ML modules. Feature selection consists of removing input variables that are irrelevant or redundant to the problem, with the goal of reducing training time and improving compatibility with the machine learning model (see the sketch below).
12 |
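A minimal sketch of both steps in Spark ML, assuming a DataFrame `df` with an assembled `features` vector and a categorical `label` column (all hypothetical names):

```scala
import org.apache.spark.ml.feature.{ChiSqSelector, PCA}

// Feature selection: keep the 10 features most associated with the label
// (ChiSqSelector runs a chi-squared test, so it expects a categorical label)
val selector = new ChiSqSelector()
  .setNumTopFeatures(10)
  .setFeaturesCol("features").setLabelCol("label")
  .setOutputCol("selectedFeatures")

// Dimensionality reduction: project the features onto 5 principal components
val pca = new PCA()
  .setK(5)
  .setInputCol("features").setOutputCol("pcaFeatures")

val selected = selector.fit(df).transform(df)
val reduced  = pca.fit(df).transform(df)
```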
13 | The notebook used during the session is [`Feature Selection.ipynb`](Feature%20Selection.ipynb). The data was taken from:
14 | * [Credit Risk Analysis](https://www.kaggle.com/datasets/rameshmehta/credit-risk-analysis) on **Kaggle**, and
15 | * [Swarm Behaviour Data Set](https://archive.ics.uci.edu/ml/datasets/Swarm+Behaviour) from the **UC Irvine Machine Learning Repository**
16 |
17 | #### Session recording
18 | [](https://www.youtube.com/watch?v=6oYcbV55YB8)
19 |
20 |
21 | #### Our social media
22 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
23 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
24 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
25 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
26 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_18/assets/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_18/assets/1.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_18/assets/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_18/assets/2.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_18/assets/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_18/assets/3.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_18/assets/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_18/assets/4.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_18/assets/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_18/assets/5.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_18/assets/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_18/assets/6.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_19/README.md:
--------------------------------------------------------------------------------
1 | ## Session 19 - Regression models with Spark Machine Learning
2 | ### Chapter 27 - Regression
3 |
4 | 
5 |
6 | ### Summary
7 | In this session, we continue studying MLlib's modules for solving machine learning problems in Apache Spark. We take a look at the regression models and explore the hyperparameters that are most commonly tuned.
8 |
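A minimal sketch, assuming a `trainingDF` with the usual `features` and `label` columns; these are the hyperparameters most often tuned on a linear regression:

```scala
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
  .setFeaturesCol("features").setLabelCol("label")
  .setMaxIter(50)          // cap on optimization iterations
  .setRegParam(0.1)        // regularization strength
  .setElasticNetParam(0.5) // mix between L2 (0.0, ridge) and L1 (1.0, lasso)

val model = lr.fit(trainingDF)
println(s"RMSE: ${model.summary.rootMeanSquaredError}")
```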
9 | #### Session recording
10 | [](https://www.youtube.com/watch?v=mHfXzC0OG4U)
11 |
12 |
13 | #### Our social media
14 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
15 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
16 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
17 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
18 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_20/README.md:
--------------------------------------------------------------------------------
1 | ## Session 20 - Classification models and hyperparameter tuning with Spark ML
2 | ### Chapter 26 - Classification
3 |
4 | 
5 |
6 | ### Summary
7 | In this session, we continue studying MLlib's models for solving machine learning problems in Apache Spark. We take a look at the classification models and learn how to tune our models in Spark.
8 |
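A minimal sketch of classification plus hyperparameter tuning, assuming a `trainingDF` with `features` and a binary `label`:

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val lr = new LogisticRegression().setFeaturesCol("features").setLabelCol("label")

// Grid of candidate hyperparameter values to search over
val grid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1, 1.0))
  .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
  .build()

// 3-fold cross-validation keeps the combination with the best AUC
val cv = new CrossValidator()
  .setEstimator(lr)
  .setEvaluator(new BinaryClassificationEvaluator())
  .setEstimatorParamMaps(grid)
  .setNumFolds(3)

val bestModel = cv.fit(trainingDF).bestModel
```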
9 |
10 | The notebook used during the session is [`Classification.ipynb`](Classification.ipynb). The data was taken from:
11 | * [Credit Card customers](https://www.kaggle.com/datasets/sakshigoyal7/credit-card-customers) on **Kaggle**
12 |
13 | #### Session recording
14 | [](https://www.youtube.com/watch?v=7_u37ugJB1A)
15 |
16 |
17 | #### Our social media
18 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
19 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
20 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
21 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
22 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_20/assets/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_20/assets/1.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_20/assets/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_20/assets/2.png
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_20/assets/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataEngineering-LATAM/Spark-StudyClub/f7e6df425d8a1a770181f1f27e9139899561a90c/Spark - The Definite Guide/sessions/session_20/assets/3.jpg
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_21/README.md:
--------------------------------------------------------------------------------
1 | ## Session 21 - Recommendation models with Spark Machine Learning
2 | ### Chapter 28 - Recommendation
3 |
4 | 
5 |
6 | ### Summary
7 | We continue studying MLlib's models for solving machine learning problems in Apache Spark. This time, we review the recommendation models. We explore the algorithms available for implementing a recommender system, with a focus on collaborative filtering.
8 |
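A minimal ALS sketch against the session's ratings file (assuming an active `spark` session, as in a Databricks notebook or `spark-shell`):

```scala
import org.apache.spark.ml.recommendation.ALS
import spark.implicits._

// Each line of sample_movielens_ratings.txt is userId::movieId::rating::timestamp
val ratings = spark.read.textFile("sample_movielens_ratings.txt")
  .map { line =>
    val f = line.split("::")
    (f(0).toInt, f(1).toInt, f(2).toFloat)
  }
  .toDF("userId", "movieId", "rating")

val als = new ALS()
  .setUserCol("userId").setItemCol("movieId").setRatingCol("rating")
  .setRank(10).setMaxIter(10).setRegParam(0.1)

val model = als.fit(ratings)
model.setColdStartStrategy("drop") // avoid NaN predictions for unseen users/items
val top3PerUser = model.recommendForAllUsers(3)
```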
9 | The notebook used is [`Recomendation System.scala`](Recomendation%20System.scala). The following dataset was used: [`sample_movielens_ratings.txt`](sample_movielens_ratings.txt)
10 |
11 | #### Session recording
12 | [](https://www.youtube.com/watch?v=Fpg5AbkzBiM)
13 |
14 |
15 | #### Our social media
16 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
17 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
18 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
19 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
20 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_21/sample_movielens_ratings.txt:
--------------------------------------------------------------------------------
1 | 0::2::3::1424380312
2 | 0::3::1::1424380312
3 | 0::5::2::1424380312
4 | 0::9::4::1424380312
5 | 0::11::1::1424380312
6 | 0::12::2::1424380312
7 | 0::15::1::1424380312
8 | 0::17::1::1424380312
9 | 0::19::1::1424380312
10 | 0::21::1::1424380312
11 | 0::23::1::1424380312
12 | 0::26::3::1424380312
13 | 0::27::1::1424380312
14 | 0::28::1::1424380312
15 | 0::29::1::1424380312
16 | 0::30::1::1424380312
17 | 0::31::1::1424380312
18 | 0::34::1::1424380312
19 | 0::37::1::1424380312
20 | 0::41::2::1424380312
21 | 0::44::1::1424380312
22 | 0::45::2::1424380312
23 | 0::46::1::1424380312
24 | 0::47::1::1424380312
25 | 0::48::1::1424380312
26 | 0::50::1::1424380312
27 | 0::51::1::1424380312
28 | 0::54::1::1424380312
29 | 0::55::1::1424380312
30 | 0::59::2::1424380312
31 | 0::61::2::1424380312
32 | 0::64::1::1424380312
33 | 0::67::1::1424380312
34 | 0::68::1::1424380312
35 | 0::69::1::1424380312
36 | 0::71::1::1424380312
37 | 0::72::1::1424380312
38 | 0::77::2::1424380312
39 | 0::79::1::1424380312
40 | 0::83::1::1424380312
41 | 0::87::1::1424380312
42 | 0::89::2::1424380312
43 | 0::91::3::1424380312
44 | 0::92::4::1424380312
45 | 0::94::1::1424380312
46 | 0::95::2::1424380312
47 | 0::96::1::1424380312
48 | 0::98::1::1424380312
49 | 0::99::1::1424380312
50 | 1::2::2::1424380312
51 | 1::3::1::1424380312
52 | 1::4::2::1424380312
53 | 1::6::1::1424380312
54 | 1::9::3::1424380312
55 | 1::12::1::1424380312
56 | 1::13::1::1424380312
57 | 1::14::1::1424380312
58 | 1::16::1::1424380312
59 | 1::19::1::1424380312
60 | 1::21::3::1424380312
61 | 1::27::1::1424380312
62 | 1::28::3::1424380312
63 | 1::33::1::1424380312
64 | 1::36::2::1424380312
65 | 1::37::1::1424380312
66 | 1::40::1::1424380312
67 | 1::41::2::1424380312
68 | 1::43::1::1424380312
69 | 1::44::1::1424380312
70 | 1::47::1::1424380312
71 | 1::50::1::1424380312
72 | 1::54::1::1424380312
73 | 1::56::2::1424380312
74 | 1::57::1::1424380312
75 | 1::58::1::1424380312
76 | 1::60::1::1424380312
77 | 1::62::4::1424380312
78 | 1::63::1::1424380312
79 | 1::67::1::1424380312
80 | 1::68::4::1424380312
81 | 1::70::2::1424380312
82 | 1::72::1::1424380312
83 | 1::73::1::1424380312
84 | 1::74::2::1424380312
85 | 1::76::1::1424380312
86 | 1::77::3::1424380312
87 | 1::78::1::1424380312
88 | 1::81::1::1424380312
89 | 1::82::1::1424380312
90 | 1::85::3::1424380312
91 | 1::86::2::1424380312
92 | 1::88::2::1424380312
93 | 1::91::1::1424380312
94 | 1::92::2::1424380312
95 | 1::93::1::1424380312
96 | 1::94::2::1424380312
97 | 1::96::1::1424380312
98 | 1::97::1::1424380312
99 | 2::4::3::1424380312
100 | 2::6::1::1424380312
101 | 2::8::5::1424380312
102 | 2::9::1::1424380312
103 | 2::10::1::1424380312
104 | 2::12::3::1424380312
105 | 2::13::1::1424380312
106 | 2::15::2::1424380312
107 | 2::18::2::1424380312
108 | 2::19::4::1424380312
109 | 2::22::1::1424380312
110 | 2::26::1::1424380312
111 | 2::28::1::1424380312
112 | 2::34::4::1424380312
113 | 2::35::1::1424380312
114 | 2::37::5::1424380312
115 | 2::38::1::1424380312
116 | 2::39::5::1424380312
117 | 2::40::4::1424380312
118 | 2::47::1::1424380312
119 | 2::50::1::1424380312
120 | 2::52::2::1424380312
121 | 2::54::1::1424380312
122 | 2::55::1::1424380312
123 | 2::57::2::1424380312
124 | 2::58::2::1424380312
125 | 2::59::1::1424380312
126 | 2::61::1::1424380312
127 | 2::62::1::1424380312
128 | 2::64::1::1424380312
129 | 2::65::1::1424380312
130 | 2::66::3::1424380312
131 | 2::68::1::1424380312
132 | 2::71::3::1424380312
133 | 2::76::1::1424380312
134 | 2::77::1::1424380312
135 | 2::78::1::1424380312
136 | 2::80::1::1424380312
137 | 2::83::5::1424380312
138 | 2::85::1::1424380312
139 | 2::87::2::1424380312
140 | 2::88::1::1424380312
141 | 2::89::4::1424380312
142 | 2::90::1::1424380312
143 | 2::92::4::1424380312
144 | 2::93::5::1424380312
145 | 3::0::1::1424380312
146 | 3::1::1::1424380312
147 | 3::2::1::1424380312
148 | 3::7::3::1424380312
149 | 3::8::3::1424380312
150 | 3::9::1::1424380312
151 | 3::14::1::1424380312
152 | 3::15::1::1424380312
153 | 3::16::1::1424380312
154 | 3::18::4::1424380312
155 | 3::19::1::1424380312
156 | 3::24::3::1424380312
157 | 3::26::1::1424380312
158 | 3::29::3::1424380312
159 | 3::33::1::1424380312
160 | 3::34::3::1424380312
161 | 3::35::1::1424380312
162 | 3::36::3::1424380312
163 | 3::37::1::1424380312
164 | 3::38::2::1424380312
165 | 3::43::1::1424380312
166 | 3::44::1::1424380312
167 | 3::46::1::1424380312
168 | 3::47::1::1424380312
169 | 3::51::5::1424380312
170 | 3::52::3::1424380312
171 | 3::56::1::1424380312
172 | 3::58::1::1424380312
173 | 3::60::3::1424380312
174 | 3::62::1::1424380312
175 | 3::65::2::1424380312
176 | 3::66::1::1424380312
177 | 3::67::1::1424380312
178 | 3::68::2::1424380312
179 | 3::70::1::1424380312
180 | 3::72::2::1424380312
181 | 3::76::3::1424380312
182 | 3::79::3::1424380312
183 | 3::80::4::1424380312
184 | 3::81::1::1424380312
185 | 3::83::1::1424380312
186 | 3::84::1::1424380312
187 | 3::86::1::1424380312
188 | 3::87::2::1424380312
189 | 3::88::4::1424380312
190 | 3::89::1::1424380312
191 | 3::91::1::1424380312
192 | 3::94::3::1424380312
193 | 4::1::1::1424380312
194 | 4::6::1::1424380312
195 | 4::8::1::1424380312
196 | 4::9::1::1424380312
197 | 4::10::1::1424380312
198 | 4::11::1::1424380312
199 | 4::12::1::1424380312
200 | 4::13::1::1424380312
201 | 4::14::2::1424380312
202 | 4::15::1::1424380312
203 | 4::17::1::1424380312
204 | 4::20::1::1424380312
205 | 4::22::1::1424380312
206 | 4::23::1::1424380312
207 | 4::24::1::1424380312
208 | 4::29::4::1424380312
209 | 4::30::1::1424380312
210 | 4::31::1::1424380312
211 | 4::34::1::1424380312
212 | 4::35::1::1424380312
213 | 4::36::1::1424380312
214 | 4::39::2::1424380312
215 | 4::40::3::1424380312
216 | 4::41::4::1424380312
217 | 4::43::2::1424380312
218 | 4::44::1::1424380312
219 | 4::45::1::1424380312
220 | 4::46::1::1424380312
221 | 4::47::1::1424380312
222 | 4::49::2::1424380312
223 | 4::50::1::1424380312
224 | 4::51::1::1424380312
225 | 4::52::4::1424380312
226 | 4::54::1::1424380312
227 | 4::55::1::1424380312
228 | 4::60::3::1424380312
229 | 4::61::1::1424380312
230 | 4::62::4::1424380312
231 | 4::63::3::1424380312
232 | 4::65::1::1424380312
233 | 4::67::2::1424380312
234 | 4::69::1::1424380312
235 | 4::70::4::1424380312
236 | 4::71::1::1424380312
237 | 4::73::1::1424380312
238 | 4::78::1::1424380312
239 | 4::84::1::1424380312
240 | 4::85::1::1424380312
241 | 4::87::3::1424380312
242 | 4::88::3::1424380312
243 | 4::89::2::1424380312
244 | 4::96::1::1424380312
245 | 4::97::1::1424380312
246 | 4::98::1::1424380312
247 | 4::99::1::1424380312
248 | 5::0::1::1424380312
249 | 5::1::1::1424380312
250 | 5::4::1::1424380312
251 | 5::5::1::1424380312
252 | 5::8::1::1424380312
253 | 5::9::3::1424380312
254 | 5::10::2::1424380312
255 | 5::13::3::1424380312
256 | 5::15::1::1424380312
257 | 5::19::1::1424380312
258 | 5::20::3::1424380312
259 | 5::21::2::1424380312
260 | 5::23::3::1424380312
261 | 5::27::1::1424380312
262 | 5::28::1::1424380312
263 | 5::29::1::1424380312
264 | 5::31::1::1424380312
265 | 5::36::3::1424380312
266 | 5::38::2::1424380312
267 | 5::39::1::1424380312
268 | 5::42::1::1424380312
269 | 5::48::3::1424380312
270 | 5::49::4::1424380312
271 | 5::50::3::1424380312
272 | 5::51::1::1424380312
273 | 5::52::1::1424380312
274 | 5::54::1::1424380312
275 | 5::55::5::1424380312
276 | 5::56::3::1424380312
277 | 5::58::1::1424380312
278 | 5::60::1::1424380312
279 | 5::61::1::1424380312
280 | 5::64::3::1424380312
281 | 5::65::2::1424380312
282 | 5::68::4::1424380312
283 | 5::70::1::1424380312
284 | 5::71::1::1424380312
285 | 5::72::1::1424380312
286 | 5::74::1::1424380312
287 | 5::79::1::1424380312
288 | 5::81::2::1424380312
289 | 5::84::1::1424380312
290 | 5::85::1::1424380312
291 | 5::86::1::1424380312
292 | 5::88::1::1424380312
293 | 5::90::4::1424380312
294 | 5::91::2::1424380312
295 | 5::95::2::1424380312
296 | 5::99::1::1424380312
297 | 6::0::1::1424380312
298 | 6::1::1::1424380312
299 | 6::2::3::1424380312
300 | 6::5::1::1424380312
301 | 6::6::1::1424380312
302 | 6::9::1::1424380312
303 | 6::10::1::1424380312
304 | 6::15::2::1424380312
305 | 6::16::2::1424380312
306 | 6::17::1::1424380312
307 | 6::18::1::1424380312
308 | 6::20::1::1424380312
309 | 6::21::1::1424380312
310 | 6::22::1::1424380312
311 | 6::24::1::1424380312
312 | 6::25::5::1424380312
313 | 6::26::1::1424380312
314 | 6::28::1::1424380312
315 | 6::30::1::1424380312
316 | 6::33::1::1424380312
317 | 6::38::1::1424380312
318 | 6::39::1::1424380312
319 | 6::43::4::1424380312
320 | 6::44::1::1424380312
321 | 6::45::1::1424380312
322 | 6::48::1::1424380312
323 | 6::49::1::1424380312
324 | 6::50::1::1424380312
325 | 6::53::1::1424380312
326 | 6::54::1::1424380312
327 | 6::55::1::1424380312
328 | 6::56::1::1424380312
329 | 6::58::4::1424380312
330 | 6::59::1::1424380312
331 | 6::60::1::1424380312
332 | 6::61::3::1424380312
333 | 6::63::3::1424380312
334 | 6::66::1::1424380312
335 | 6::67::3::1424380312
336 | 6::68::1::1424380312
337 | 6::69::1::1424380312
338 | 6::71::2::1424380312
339 | 6::73::1::1424380312
340 | 6::75::1::1424380312
341 | 6::77::1::1424380312
342 | 6::79::1::1424380312
343 | 6::81::1::1424380312
344 | 6::84::1::1424380312
345 | 6::85::3::1424380312
346 | 6::86::1::1424380312
347 | 6::87::1::1424380312
348 | 6::88::1::1424380312
349 | 6::89::1::1424380312
350 | 6::91::2::1424380312
351 | 6::94::1::1424380312
352 | 6::95::2::1424380312
353 | 6::96::1::1424380312
354 | 7::1::1::1424380312
355 | 7::2::2::1424380312
356 | 7::3::1::1424380312
357 | 7::4::1::1424380312
358 | 7::7::1::1424380312
359 | 7::10::1::1424380312
360 | 7::11::2::1424380312
361 | 7::14::2::1424380312
362 | 7::15::1::1424380312
363 | 7::16::1::1424380312
364 | 7::18::1::1424380312
365 | 7::21::1::1424380312
366 | 7::22::1::1424380312
367 | 7::23::1::1424380312
368 | 7::25::5::1424380312
369 | 7::26::1::1424380312
370 | 7::29::4::1424380312
371 | 7::30::1::1424380312
372 | 7::31::3::1424380312
373 | 7::32::1::1424380312
374 | 7::33::1::1424380312
375 | 7::35::1::1424380312
376 | 7::37::2::1424380312
377 | 7::39::3::1424380312
378 | 7::40::2::1424380312
379 | 7::42::2::1424380312
380 | 7::44::1::1424380312
381 | 7::45::2::1424380312
382 | 7::47::4::1424380312
383 | 7::48::1::1424380312
384 | 7::49::1::1424380312
385 | 7::53::1::1424380312
386 | 7::54::1::1424380312
387 | 7::55::1::1424380312
388 | 7::56::1::1424380312
389 | 7::59::1::1424380312
390 | 7::61::2::1424380312
391 | 7::62::3::1424380312
392 | 7::63::2::1424380312
393 | 7::66::1::1424380312
394 | 7::67::3::1424380312
395 | 7::74::1::1424380312
396 | 7::75::1::1424380312
397 | 7::76::3::1424380312
398 | 7::77::1::1424380312
399 | 7::81::1::1424380312
400 | 7::82::1::1424380312
401 | 7::84::2::1424380312
402 | 7::85::4::1424380312
403 | 7::86::1::1424380312
404 | 7::92::2::1424380312
405 | 7::96::1::1424380312
406 | 7::97::1::1424380312
407 | 7::98::1::1424380312
408 | 8::0::1::1424380312
409 | 8::2::4::1424380312
410 | 8::3::2::1424380312
411 | 8::4::2::1424380312
412 | 8::5::1::1424380312
413 | 8::7::1::1424380312
414 | 8::9::1::1424380312
415 | 8::11::1::1424380312
416 | 8::15::1::1424380312
417 | 8::18::1::1424380312
418 | 8::19::1::1424380312
419 | 8::21::1::1424380312
420 | 8::29::5::1424380312
421 | 8::31::3::1424380312
422 | 8::33::1::1424380312
423 | 8::35::1::1424380312
424 | 8::36::1::1424380312
425 | 8::40::2::1424380312
426 | 8::44::1::1424380312
427 | 8::45::1::1424380312
428 | 8::50::1::1424380312
429 | 8::51::1::1424380312
430 | 8::52::5::1424380312
431 | 8::53::5::1424380312
432 | 8::54::1::1424380312
433 | 8::55::1::1424380312
434 | 8::56::1::1424380312
435 | 8::58::4::1424380312
436 | 8::60::3::1424380312
437 | 8::62::4::1424380312
438 | 8::64::1::1424380312
439 | 8::67::3::1424380312
440 | 8::69::1::1424380312
441 | 8::71::1::1424380312
442 | 8::72::3::1424380312
443 | 8::77::3::1424380312
444 | 8::78::1::1424380312
445 | 8::79::1::1424380312
446 | 8::83::1::1424380312
447 | 8::85::5::1424380312
448 | 8::86::1::1424380312
449 | 8::88::1::1424380312
450 | 8::90::1::1424380312
451 | 8::92::2::1424380312
452 | 8::95::4::1424380312
453 | 8::96::3::1424380312
454 | 8::97::1::1424380312
455 | 8::98::1::1424380312
456 | 8::99::1::1424380312
457 | 9::2::3::1424380312
458 | 9::3::1::1424380312
459 | 9::4::1::1424380312
460 | 9::5::1::1424380312
461 | 9::6::1::1424380312
462 | 9::7::5::1424380312
463 | 9::9::1::1424380312
464 | 9::12::1::1424380312
465 | 9::14::3::1424380312
466 | 9::15::1::1424380312
467 | 9::19::1::1424380312
468 | 9::21::1::1424380312
469 | 9::22::1::1424380312
470 | 9::24::1::1424380312
471 | 9::25::1::1424380312
472 | 9::26::1::1424380312
473 | 9::30::3::1424380312
474 | 9::32::4::1424380312
475 | 9::35::2::1424380312
476 | 9::36::2::1424380312
477 | 9::37::2::1424380312
478 | 9::38::1::1424380312
479 | 9::39::1::1424380312
480 | 9::43::3::1424380312
481 | 9::49::5::1424380312
482 | 9::50::3::1424380312
483 | 9::53::1::1424380312
484 | 9::54::1::1424380312
485 | 9::58::1::1424380312
486 | 9::59::1::1424380312
487 | 9::60::1::1424380312
488 | 9::61::1::1424380312
489 | 9::63::3::1424380312
490 | 9::64::3::1424380312
491 | 9::68::1::1424380312
492 | 9::69::1::1424380312
493 | 9::70::3::1424380312
494 | 9::71::1::1424380312
495 | 9::73::2::1424380312
496 | 9::75::1::1424380312
497 | 9::77::2::1424380312
498 | 9::81::2::1424380312
499 | 9::82::1::1424380312
500 | 9::83::1::1424380312
501 | 9::84::1::1424380312
502 | 9::86::1::1424380312
503 | 9::87::4::1424380312
504 | 9::88::1::1424380312
505 | 9::90::3::1424380312
506 | 9::94::2::1424380312
507 | 9::95::3::1424380312
508 | 9::97::2::1424380312
509 | 9::98::1::1424380312
510 | 10::0::3::1424380312
511 | 10::2::4::1424380312
512 | 10::4::3::1424380312
513 | 10::7::1::1424380312
514 | 10::8::1::1424380312
515 | 10::10::1::1424380312
516 | 10::13::2::1424380312
517 | 10::14::1::1424380312
518 | 10::16::2::1424380312
519 | 10::17::1::1424380312
520 | 10::18::1::1424380312
521 | 10::21::1::1424380312
522 | 10::22::1::1424380312
523 | 10::24::1::1424380312
524 | 10::25::3::1424380312
525 | 10::28::1::1424380312
526 | 10::35::1::1424380312
527 | 10::36::1::1424380312
528 | 10::37::1::1424380312
529 | 10::38::1::1424380312
530 | 10::39::1::1424380312
531 | 10::40::4::1424380312
532 | 10::41::2::1424380312
533 | 10::42::3::1424380312
534 | 10::43::1::1424380312
535 | 10::49::3::1424380312
536 | 10::50::1::1424380312
537 | 10::51::1::1424380312
538 | 10::52::1::1424380312
539 | 10::55::2::1424380312
540 | 10::56::1::1424380312
541 | 10::58::1::1424380312
542 | 10::63::1::1424380312
543 | 10::66::1::1424380312
544 | 10::67::2::1424380312
545 | 10::68::1::1424380312
546 | 10::75::1::1424380312
547 | 10::77::1::1424380312
548 | 10::79::1::1424380312
549 | 10::86::1::1424380312
550 | 10::89::3::1424380312
551 | 10::90::1::1424380312
552 | 10::97::1::1424380312
553 | 10::98::1::1424380312
554 | 11::0::1::1424380312
555 | 11::6::2::1424380312
556 | 11::9::1::1424380312
557 | 11::10::1::1424380312
558 | 11::11::1::1424380312
559 | 11::12::1::1424380312
560 | 11::13::4::1424380312
561 | 11::16::1::1424380312
562 | 11::18::5::1424380312
563 | 11::19::4::1424380312
564 | 11::20::1::1424380312
565 | 11::21::1::1424380312
566 | 11::22::1::1424380312
567 | 11::23::5::1424380312
568 | 11::25::1::1424380312
569 | 11::27::5::1424380312
570 | 11::30::5::1424380312
571 | 11::32::5::1424380312
572 | 11::35::3::1424380312
573 | 11::36::2::1424380312
574 | 11::37::2::1424380312
575 | 11::38::4::1424380312
576 | 11::39::1::1424380312
577 | 11::40::1::1424380312
578 | 11::41::1::1424380312
579 | 11::43::2::1424380312
580 | 11::45::1::1424380312
581 | 11::47::1::1424380312
582 | 11::48::5::1424380312
583 | 11::50::4::1424380312
584 | 11::51::3::1424380312
585 | 11::59::1::1424380312
586 | 11::61::1::1424380312
587 | 11::62::1::1424380312
588 | 11::64::1::1424380312
589 | 11::66::4::1424380312
590 | 11::67::1::1424380312
591 | 11::69::5::1424380312
592 | 11::70::1::1424380312
593 | 11::71::3::1424380312
594 | 11::72::3::1424380312
595 | 11::75::3::1424380312
596 | 11::76::1::1424380312
597 | 11::77::1::1424380312
598 | 11::78::1::1424380312
599 | 11::79::5::1424380312
600 | 11::80::3::1424380312
601 | 11::81::4::1424380312
602 | 11::82::1::1424380312
603 | 11::86::1::1424380312
604 | 11::88::1::1424380312
605 | 11::89::1::1424380312
606 | 11::90::4::1424380312
607 | 11::94::2::1424380312
608 | 11::97::3::1424380312
609 | 11::99::1::1424380312
610 | 12::2::1::1424380312
611 | 12::4::1::1424380312
612 | 12::6::1::1424380312
613 | 12::7::3::1424380312
614 | 12::8::1::1424380312
615 | 12::14::1::1424380312
616 | 12::15::2::1424380312
617 | 12::16::4::1424380312
618 | 12::17::5::1424380312
619 | 12::18::2::1424380312
620 | 12::21::1::1424380312
621 | 12::22::2::1424380312
622 | 12::23::3::1424380312
623 | 12::24::1::1424380312
624 | 12::25::1::1424380312
625 | 12::27::5::1424380312
626 | 12::30::2::1424380312
627 | 12::31::4::1424380312
628 | 12::35::5::1424380312
629 | 12::38::1::1424380312
630 | 12::41::1::1424380312
631 | 12::44::2::1424380312
632 | 12::45::1::1424380312
633 | 12::50::4::1424380312
634 | 12::51::1::1424380312
635 | 12::52::1::1424380312
636 | 12::53::1::1424380312
637 | 12::54::1::1424380312
638 | 12::56::2::1424380312
639 | 12::57::1::1424380312
640 | 12::60::1::1424380312
641 | 12::63::1::1424380312
642 | 12::64::5::1424380312
643 | 12::66::3::1424380312
644 | 12::67::1::1424380312
645 | 12::70::1::1424380312
646 | 12::72::1::1424380312
647 | 12::74::1::1424380312
648 | 12::75::1::1424380312
649 | 12::77::1::1424380312
650 | 12::78::1::1424380312
651 | 12::79::3::1424380312
652 | 12::82::2::1424380312
653 | 12::83::1::1424380312
654 | 12::84::1::1424380312
655 | 12::85::1::1424380312
656 | 12::86::1::1424380312
657 | 12::87::1::1424380312
658 | 12::88::1::1424380312
659 | 12::91::3::1424380312
660 | 12::92::1::1424380312
661 | 12::94::4::1424380312
662 | 12::95::2::1424380312
663 | 12::96::1::1424380312
664 | 12::98::2::1424380312
665 | 13::0::1::1424380312
666 | 13::3::1::1424380312
667 | 13::4::2::1424380312
668 | 13::5::1::1424380312
669 | 13::6::1::1424380312
670 | 13::12::1::1424380312
671 | 13::14::2::1424380312
672 | 13::15::1::1424380312
673 | 13::17::1::1424380312
674 | 13::18::3::1424380312
675 | 13::20::1::1424380312
676 | 13::21::1::1424380312
677 | 13::22::1::1424380312
678 | 13::26::1::1424380312
679 | 13::27::1::1424380312
680 | 13::29::3::1424380312
681 | 13::31::1::1424380312
682 | 13::33::1::1424380312
683 | 13::40::2::1424380312
684 | 13::43::2::1424380312
685 | 13::44::1::1424380312
686 | 13::45::1::1424380312
687 | 13::49::1::1424380312
688 | 13::51::1::1424380312
689 | 13::52::2::1424380312
690 | 13::53::3::1424380312
691 | 13::54::1::1424380312
692 | 13::62::1::1424380312
693 | 13::63::2::1424380312
694 | 13::64::1::1424380312
695 | 13::68::1::1424380312
696 | 13::71::1::1424380312
697 | 13::72::3::1424380312
698 | 13::73::1::1424380312
699 | 13::74::3::1424380312
700 | 13::77::2::1424380312
701 | 13::78::1::1424380312
702 | 13::79::2::1424380312
703 | 13::83::3::1424380312
704 | 13::85::1::1424380312
705 | 13::86::1::1424380312
706 | 13::87::2::1424380312
707 | 13::88::2::1424380312
708 | 13::90::1::1424380312
709 | 13::93::4::1424380312
710 | 13::94::1::1424380312
711 | 13::98::1::1424380312
712 | 13::99::1::1424380312
713 | 14::1::1::1424380312
714 | 14::3::3::1424380312
715 | 14::4::1::1424380312
716 | 14::5::1::1424380312
717 | 14::6::1::1424380312
718 | 14::7::1::1424380312
719 | 14::9::1::1424380312
720 | 14::10::1::1424380312
721 | 14::11::1::1424380312
722 | 14::12::1::1424380312
723 | 14::13::1::1424380312
724 | 14::14::3::1424380312
725 | 14::15::1::1424380312
726 | 14::16::1::1424380312
727 | 14::17::1::1424380312
728 | 14::20::1::1424380312
729 | 14::21::1::1424380312
730 | 14::24::1::1424380312
731 | 14::25::2::1424380312
732 | 14::27::1::1424380312
733 | 14::28::1::1424380312
734 | 14::29::5::1424380312
735 | 14::31::3::1424380312
736 | 14::34::1::1424380312
737 | 14::36::1::1424380312
738 | 14::37::2::1424380312
739 | 14::39::2::1424380312
740 | 14::40::1::1424380312
741 | 14::44::1::1424380312
742 | 14::45::1::1424380312
743 | 14::47::3::1424380312
744 | 14::48::1::1424380312
745 | 14::49::1::1424380312
746 | 14::51::1::1424380312
747 | 14::52::5::1424380312
748 | 14::53::3::1424380312
749 | 14::54::1::1424380312
750 | 14::55::1::1424380312
751 | 14::56::1::1424380312
752 | 14::62::4::1424380312
753 | 14::63::5::1424380312
754 | 14::67::3::1424380312
755 | 14::68::1::1424380312
756 | 14::69::3::1424380312
757 | 14::71::1::1424380312
758 | 14::72::4::1424380312
759 | 14::73::1::1424380312
760 | 14::76::5::1424380312
761 | 14::79::1::1424380312
762 | 14::82::1::1424380312
763 | 14::83::1::1424380312
764 | 14::88::1::1424380312
765 | 14::93::3::1424380312
766 | 14::94::1::1424380312
767 | 14::95::2::1424380312
768 | 14::96::4::1424380312
769 | 14::98::1::1424380312
770 | 15::0::1::1424380312
771 | 15::1::4::1424380312
772 | 15::2::1::1424380312
773 | 15::5::2::1424380312
774 | 15::6::1::1424380312
775 | 15::7::1::1424380312
776 | 15::13::1::1424380312
777 | 15::14::1::1424380312
778 | 15::15::1::1424380312
779 | 15::17::2::1424380312
780 | 15::19::2::1424380312
781 | 15::22::2::1424380312
782 | 15::23::2::1424380312
783 | 15::25::1::1424380312
784 | 15::26::3::1424380312
785 | 15::27::1::1424380312
786 | 15::28::2::1424380312
787 | 15::29::1::1424380312
788 | 15::32::1::1424380312
789 | 15::33::2::1424380312
790 | 15::34::1::1424380312
791 | 15::35::2::1424380312
792 | 15::36::1::1424380312
793 | 15::37::1::1424380312
794 | 15::39::1::1424380312
795 | 15::42::1::1424380312
796 | 15::46::5::1424380312
797 | 15::48::2::1424380312
798 | 15::50::2::1424380312
799 | 15::51::1::1424380312
800 | 15::52::1::1424380312
801 | 15::58::1::1424380312
802 | 15::62::1::1424380312
803 | 15::64::3::1424380312
804 | 15::65::2::1424380312
805 | 15::72::1::1424380312
806 | 15::73::1::1424380312
807 | 15::74::1::1424380312
808 | 15::79::1::1424380312
809 | 15::80::1::1424380312
810 | 15::81::1::1424380312
811 | 15::82::2::1424380312
812 | 15::85::1::1424380312
813 | 15::87::1::1424380312
814 | 15::91::2::1424380312
815 | 15::96::1::1424380312
816 | 15::97::1::1424380312
817 | 15::98::3::1424380312
818 | 16::2::1::1424380312
819 | 16::5::3::1424380312
820 | 16::6::2::1424380312
821 | 16::7::1::1424380312
822 | 16::9::1::1424380312
823 | 16::12::1::1424380312
824 | 16::14::1::1424380312
825 | 16::15::1::1424380312
826 | 16::19::1::1424380312
827 | 16::21::2::1424380312
828 | 16::29::4::1424380312
829 | 16::30::2::1424380312
830 | 16::32::1::1424380312
831 | 16::34::1::1424380312
832 | 16::36::1::1424380312
833 | 16::38::1::1424380312
834 | 16::46::1::1424380312
835 | 16::47::3::1424380312
836 | 16::48::1::1424380312
837 | 16::49::1::1424380312
838 | 16::50::1::1424380312
839 | 16::51::5::1424380312
840 | 16::54::5::1424380312
841 | 16::55::1::1424380312
842 | 16::56::2::1424380312
843 | 16::57::1::1424380312
844 | 16::60::1::1424380312
845 | 16::63::2::1424380312
846 | 16::65::1::1424380312
847 | 16::67::1::1424380312
848 | 16::72::1::1424380312
849 | 16::74::1::1424380312
850 | 16::80::1::1424380312
851 | 16::81::1::1424380312
852 | 16::82::1::1424380312
853 | 16::85::5::1424380312
854 | 16::86::1::1424380312
855 | 16::90::5::1424380312
856 | 16::91::1::1424380312
857 | 16::93::1::1424380312
858 | 16::94::3::1424380312
859 | 16::95::2::1424380312
860 | 16::96::3::1424380312
861 | 16::98::3::1424380312
862 | 16::99::1::1424380312
863 | 17::2::1::1424380312
864 | 17::3::1::1424380312
865 | 17::6::1::1424380312
866 | 17::10::4::1424380312
867 | 17::11::1::1424380312
868 | 17::13::2::1424380312
869 | 17::17::5::1424380312
870 | 17::19::1::1424380312
871 | 17::20::5::1424380312
872 | 17::22::4::1424380312
873 | 17::28::1::1424380312
874 | 17::29::1::1424380312
875 | 17::33::1::1424380312
876 | 17::34::1::1424380312
877 | 17::35::2::1424380312
878 | 17::37::1::1424380312
879 | 17::38::1::1424380312
880 | 17::45::1::1424380312
881 | 17::46::5::1424380312
882 | 17::47::1::1424380312
883 | 17::49::3::1424380312
884 | 17::51::1::1424380312
885 | 17::55::5::1424380312
886 | 17::56::3::1424380312
887 | 17::57::1::1424380312
888 | 17::58::1::1424380312
889 | 17::59::1::1424380312
890 | 17::60::1::1424380312
891 | 17::63::1::1424380312
892 | 17::66::1::1424380312
893 | 17::68::4::1424380312
894 | 17::69::1::1424380312
895 | 17::70::1::1424380312
896 | 17::72::1::1424380312
897 | 17::73::3::1424380312
898 | 17::78::1::1424380312
899 | 17::79::1::1424380312
900 | 17::82::2::1424380312
901 | 17::84::1::1424380312
902 | 17::90::5::1424380312
903 | 17::91::3::1424380312
904 | 17::92::1::1424380312
905 | 17::93::1::1424380312
906 | 17::94::4::1424380312
907 | 17::95::2::1424380312
908 | 17::97::1::1424380312
909 | 18::1::1::1424380312
910 | 18::4::3::1424380312
911 | 18::5::2::1424380312
912 | 18::6::1::1424380312
913 | 18::7::1::1424380312
914 | 18::10::1::1424380312
915 | 18::11::4::1424380312
916 | 18::12::2::1424380312
917 | 18::13::1::1424380312
918 | 18::15::1::1424380312
919 | 18::18::1::1424380312
920 | 18::20::1::1424380312
921 | 18::21::2::1424380312
922 | 18::22::1::1424380312
923 | 18::23::2::1424380312
924 | 18::25::1::1424380312
925 | 18::26::1::1424380312
926 | 18::27::1::1424380312
927 | 18::28::5::1424380312
928 | 18::29::1::1424380312
929 | 18::31::1::1424380312
930 | 18::32::1::1424380312
931 | 18::36::1::1424380312
932 | 18::38::5::1424380312
933 | 18::39::5::1424380312
934 | 18::40::1::1424380312
935 | 18::42::1::1424380312
936 | 18::43::1::1424380312
937 | 18::44::4::1424380312
938 | 18::46::1::1424380312
939 | 18::47::1::1424380312
940 | 18::48::1::1424380312
941 | 18::51::2::1424380312
942 | 18::55::1::1424380312
943 | 18::56::1::1424380312
944 | 18::57::1::1424380312
945 | 18::62::1::1424380312
946 | 18::63::1::1424380312
947 | 18::66::3::1424380312
948 | 18::67::1::1424380312
949 | 18::70::1::1424380312
950 | 18::75::1::1424380312
951 | 18::76::3::1424380312
952 | 18::77::1::1424380312
953 | 18::80::3::1424380312
954 | 18::81::3::1424380312
955 | 18::82::1::1424380312
956 | 18::83::5::1424380312
957 | 18::84::1::1424380312
958 | 18::97::1::1424380312
959 | 18::98::1::1424380312
960 | 18::99::2::1424380312
961 | 19::0::1::1424380312
962 | 19::1::1::1424380312
963 | 19::2::1::1424380312
964 | 19::4::1::1424380312
965 | 19::6::2::1424380312
966 | 19::11::1::1424380312
967 | 19::12::1::1424380312
968 | 19::14::1::1424380312
969 | 19::23::1::1424380312
970 | 19::26::1::1424380312
971 | 19::31::1::1424380312
972 | 19::32::4::1424380312
973 | 19::33::1::1424380312
974 | 19::34::1::1424380312
975 | 19::37::1::1424380312
976 | 19::38::1::1424380312
977 | 19::41::1::1424380312
978 | 19::43::1::1424380312
979 | 19::45::1::1424380312
980 | 19::48::1::1424380312
981 | 19::49::1::1424380312
982 | 19::50::2::1424380312
983 | 19::53::2::1424380312
984 | 19::54::3::1424380312
985 | 19::55::1::1424380312
986 | 19::56::2::1424380312
987 | 19::58::1::1424380312
988 | 19::61::1::1424380312
989 | 19::62::1::1424380312
990 | 19::63::1::1424380312
991 | 19::64::1::1424380312
992 | 19::65::1::1424380312
993 | 19::69::2::1424380312
994 | 19::72::1::1424380312
995 | 19::74::3::1424380312
996 | 19::76::1::1424380312
997 | 19::78::1::1424380312
998 | 19::79::1::1424380312
999 | 19::81::1::1424380312
1000 | 19::82::1::1424380312
1001 | 19::84::1::1424380312
1002 | 19::86::1::1424380312
1003 | 19::87::2::1424380312
1004 | 19::90::4::1424380312
1005 | 19::93::1::1424380312
1006 | 19::94::4::1424380312
1007 | 19::95::2::1424380312
1008 | 19::96::1::1424380312
1009 | 19::98::4::1424380312
1010 | 20::0::1::1424380312
1011 | 20::1::1::1424380312
1012 | 20::2::2::1424380312
1013 | 20::4::2::1424380312
1014 | 20::6::1::1424380312
1015 | 20::8::1::1424380312
1016 | 20::12::1::1424380312
1017 | 20::21::2::1424380312
1018 | 20::22::5::1424380312
1019 | 20::24::2::1424380312
1020 | 20::25::1::1424380312
1021 | 20::26::1::1424380312
1022 | 20::29::2::1424380312
1023 | 20::30::2::1424380312
1024 | 20::32::2::1424380312
1025 | 20::39::1::1424380312
1026 | 20::40::1::1424380312
1027 | 20::41::2::1424380312
1028 | 20::45::2::1424380312
1029 | 20::48::1::1424380312
1030 | 20::50::1::1424380312
1031 | 20::51::3::1424380312
1032 | 20::53::3::1424380312
1033 | 20::55::1::1424380312
1034 | 20::57::2::1424380312
1035 | 20::60::1::1424380312
1036 | 20::61::1::1424380312
1037 | 20::64::1::1424380312
1038 | 20::66::1::1424380312
1039 | 20::70::2::1424380312
1040 | 20::72::1::1424380312
1041 | 20::73::2::1424380312
1042 | 20::75::4::1424380312
1043 | 20::76::1::1424380312
1044 | 20::77::4::1424380312
1045 | 20::78::1::1424380312
1046 | 20::79::1::1424380312
1047 | 20::84::2::1424380312
1048 | 20::85::2::1424380312
1049 | 20::88::3::1424380312
1050 | 20::89::1::1424380312
1051 | 20::90::3::1424380312
1052 | 20::91::1::1424380312
1053 | 20::92::2::1424380312
1054 | 20::93::1::1424380312
1055 | 20::94::4::1424380312
1056 | 20::97::1::1424380312
1057 | 21::0::1::1424380312
1058 | 21::2::4::1424380312
1059 | 21::3::1::1424380312
1060 | 21::7::2::1424380312
1061 | 21::11::1::1424380312
1062 | 21::12::1::1424380312
1063 | 21::13::1::1424380312
1064 | 21::14::3::1424380312
1065 | 21::17::1::1424380312
1066 | 21::19::1::1424380312
1067 | 21::20::1::1424380312
1068 | 21::21::1::1424380312
1069 | 21::22::1::1424380312
1070 | 21::23::1::1424380312
1071 | 21::24::1::1424380312
1072 | 21::27::1::1424380312
1073 | 21::29::5::1424380312
1074 | 21::30::2::1424380312
1075 | 21::38::1::1424380312
1076 | 21::40::2::1424380312
1077 | 21::43::3::1424380312
1078 | 21::44::1::1424380312
1079 | 21::45::1::1424380312
1080 | 21::46::1::1424380312
1081 | 21::48::1::1424380312
1082 | 21::51::1::1424380312
1083 | 21::53::5::1424380312
1084 | 21::54::1::1424380312
1085 | 21::55::1::1424380312
1086 | 21::56::1::1424380312
1087 | 21::58::3::1424380312
1088 | 21::59::3::1424380312
1089 | 21::64::1::1424380312
1090 | 21::66::1::1424380312
1091 | 21::68::1::1424380312
1092 | 21::71::1::1424380312
1093 | 21::73::1::1424380312
1094 | 21::74::4::1424380312
1095 | 21::80::1::1424380312
1096 | 21::81::1::1424380312
1097 | 21::83::1::1424380312
1098 | 21::84::1::1424380312
1099 | 21::85::3::1424380312
1100 | 21::87::4::1424380312
1101 | 21::89::2::1424380312
1102 | 21::92::2::1424380312
1103 | 21::96::3::1424380312
1104 | 21::99::1::1424380312
1105 | 22::0::1::1424380312
1106 | 22::3::2::1424380312
1107 | 22::5::2::1424380312
1108 | 22::6::2::1424380312
1109 | 22::9::1::1424380312
1110 | 22::10::1::1424380312
1111 | 22::11::1::1424380312
1112 | 22::13::1::1424380312
1113 | 22::14::1::1424380312
1114 | 22::16::1::1424380312
1115 | 22::18::3::1424380312
1116 | 22::19::1::1424380312
1117 | 22::22::5::1424380312
1118 | 22::25::1::1424380312
1119 | 22::26::1::1424380312
1120 | 22::29::3::1424380312
1121 | 22::30::5::1424380312
1122 | 22::32::4::1424380312
1123 | 22::33::1::1424380312
1124 | 22::35::1::1424380312
1125 | 22::36::3::1424380312
1126 | 22::37::1::1424380312
1127 | 22::40::1::1424380312
1128 | 22::41::3::1424380312
1129 | 22::44::1::1424380312
1130 | 22::45::2::1424380312
1131 | 22::48::1::1424380312
1132 | 22::51::5::1424380312
1133 | 22::55::1::1424380312
1134 | 22::56::2::1424380312
1135 | 22::60::3::1424380312
1136 | 22::61::1::1424380312
1137 | 22::62::4::1424380312
1138 | 22::63::1::1424380312
1139 | 22::65::1::1424380312
1140 | 22::66::1::1424380312
1141 | 22::68::4::1424380312
1142 | 22::69::4::1424380312
1143 | 22::70::3::1424380312
1144 | 22::71::1::1424380312
1145 | 22::74::5::1424380312
1146 | 22::75::5::1424380312
1147 | 22::78::1::1424380312
1148 | 22::80::3::1424380312
1149 | 22::81::1::1424380312
1150 | 22::82::1::1424380312
1151 | 22::84::1::1424380312
1152 | 22::86::1::1424380312
1153 | 22::87::3::1424380312
1154 | 22::88::5::1424380312
1155 | 22::90::2::1424380312
1156 | 22::92::3::1424380312
1157 | 22::95::2::1424380312
1158 | 22::96::2::1424380312
1159 | 22::98::4::1424380312
1160 | 22::99::1::1424380312
1161 | 23::0::1::1424380312
1162 | 23::2::1::1424380312
1163 | 23::4::1::1424380312
1164 | 23::6::2::1424380312
1165 | 23::10::4::1424380312
1166 | 23::12::1::1424380312
1167 | 23::13::4::1424380312
1168 | 23::14::1::1424380312
1169 | 23::15::1::1424380312
1170 | 23::18::4::1424380312
1171 | 23::22::2::1424380312
1172 | 23::23::4::1424380312
1173 | 23::24::1::1424380312
1174 | 23::25::1::1424380312
1175 | 23::26::1::1424380312
1176 | 23::27::5::1424380312
1177 | 23::28::1::1424380312
1178 | 23::29::1::1424380312
1179 | 23::30::4::1424380312
1180 | 23::32::5::1424380312
1181 | 23::33::2::1424380312
1182 | 23::36::3::1424380312
1183 | 23::37::1::1424380312
1184 | 23::38::1::1424380312
1185 | 23::39::1::1424380312
1186 | 23::43::1::1424380312
1187 | 23::48::5::1424380312
1188 | 23::49::5::1424380312
1189 | 23::50::4::1424380312
1190 | 23::53::1::1424380312
1191 | 23::55::5::1424380312
1192 | 23::57::1::1424380312
1193 | 23::59::1::1424380312
1194 | 23::60::1::1424380312
1195 | 23::61::1::1424380312
1196 | 23::64::4::1424380312
1197 | 23::65::5::1424380312
1198 | 23::66::2::1424380312
1199 | 23::67::1::1424380312
1200 | 23::68::3::1424380312
1201 | 23::69::1::1424380312
1202 | 23::72::1::1424380312
1203 | 23::73::3::1424380312
1204 | 23::77::1::1424380312
1205 | 23::82::2::1424380312
1206 | 23::83::1::1424380312
1207 | 23::84::1::1424380312
1208 | 23::85::1::1424380312
1209 | 23::87::3::1424380312
1210 | 23::88::1::1424380312
1211 | 23::95::2::1424380312
1212 | 23::97::1::1424380312
1213 | 24::4::1::1424380312
1214 | 24::6::3::1424380312
1215 | 24::7::1::1424380312
1216 | 24::10::2::1424380312
1217 | 24::12::1::1424380312
1218 | 24::15::1::1424380312
1219 | 24::19::1::1424380312
1220 | 24::24::1::1424380312
1221 | 24::27::3::1424380312
1222 | 24::30::5::1424380312
1223 | 24::31::1::1424380312
1224 | 24::32::3::1424380312
1225 | 24::33::1::1424380312
1226 | 24::37::1::1424380312
1227 | 24::39::1::1424380312
1228 | 24::40::1::1424380312
1229 | 24::42::1::1424380312
1230 | 24::43::3::1424380312
1231 | 24::45::2::1424380312
1232 | 24::46::1::1424380312
1233 | 24::47::1::1424380312
1234 | 24::48::1::1424380312
1235 | 24::49::1::1424380312
1236 | 24::50::1::1424380312
1237 | 24::52::5::1424380312
1238 | 24::57::1::1424380312
1239 | 24::59::4::1424380312
1240 | 24::63::4::1424380312
1241 | 24::65::1::1424380312
1242 | 24::66::1::1424380312
1243 | 24::67::1::1424380312
1244 | 24::68::3::1424380312
1245 | 24::69::5::1424380312
1246 | 24::71::1::1424380312
1247 | 24::72::4::1424380312
1248 | 24::77::4::1424380312
1249 | 24::78::1::1424380312
1250 | 24::80::1::1424380312
1251 | 24::82::1::1424380312
1252 | 24::84::1::1424380312
1253 | 24::86::1::1424380312
1254 | 24::87::1::1424380312
1255 | 24::88::2::1424380312
1256 | 24::89::1::1424380312
1257 | 24::90::5::1424380312
1258 | 24::91::1::1424380312
1259 | 24::92::1::1424380312
1260 | 24::94::2::1424380312
1261 | 24::95::1::1424380312
1262 | 24::96::5::1424380312
1263 | 24::98::1::1424380312
1264 | 24::99::1::1424380312
1265 | 25::1::3::1424380312
1266 | 25::2::1::1424380312
1267 | 25::7::1::1424380312
1268 | 25::9::1::1424380312
1269 | 25::12::3::1424380312
1270 | 25::16::3::1424380312
1271 | 25::17::1::1424380312
1272 | 25::18::1::1424380312
1273 | 25::20::1::1424380312
1274 | 25::22::1::1424380312
1275 | 25::23::1::1424380312
1276 | 25::26::2::1424380312
1277 | 25::29::1::1424380312
1278 | 25::30::1::1424380312
1279 | 25::31::2::1424380312
1280 | 25::33::4::1424380312
1281 | 25::34::3::1424380312
1282 | 25::35::2::1424380312
1283 | 25::36::1::1424380312
1284 | 25::37::1::1424380312
1285 | 25::40::1::1424380312
1286 | 25::41::1::1424380312
1287 | 25::43::1::1424380312
1288 | 25::47::4::1424380312
1289 | 25::50::1::1424380312
1290 | 25::51::1::1424380312
1291 | 25::53::1::1424380312
1292 | 25::56::1::1424380312
1293 | 25::58::2::1424380312
1294 | 25::64::2::1424380312
1295 | 25::67::2::1424380312
1296 | 25::68::1::1424380312
1297 | 25::70::1::1424380312
1298 | 25::71::4::1424380312
1299 | 25::73::1::1424380312
1300 | 25::74::1::1424380312
1301 | 25::76::1::1424380312
1302 | 25::79::1::1424380312
1303 | 25::82::1::1424380312
1304 | 25::84::2::1424380312
1305 | 25::85::1::1424380312
1306 | 25::91::3::1424380312
1307 | 25::92::1::1424380312
1308 | 25::94::1::1424380312
1309 | 25::95::1::1424380312
1310 | 25::97::2::1424380312
1311 | 26::0::1::1424380312
1312 | 26::1::1::1424380312
1313 | 26::2::1::1424380312
1314 | 26::3::1::1424380312
1315 | 26::4::4::1424380312
1316 | 26::5::2::1424380312
1317 | 26::6::3::1424380312
1318 | 26::7::5::1424380312
1319 | 26::13::3::1424380312
1320 | 26::14::1::1424380312
1321 | 26::16::1::1424380312
1322 | 26::18::3::1424380312
1323 | 26::20::1::1424380312
1324 | 26::21::3::1424380312
1325 | 26::22::5::1424380312
1326 | 26::23::5::1424380312
1327 | 26::24::5::1424380312
1328 | 26::27::1::1424380312
1329 | 26::31::1::1424380312
1330 | 26::35::1::1424380312
1331 | 26::36::4::1424380312
1332 | 26::40::1::1424380312
1333 | 26::44::1::1424380312
1334 | 26::45::2::1424380312
1335 | 26::47::1::1424380312
1336 | 26::48::1::1424380312
1337 | 26::49::3::1424380312
1338 | 26::50::2::1424380312
1339 | 26::52::1::1424380312
1340 | 26::54::4::1424380312
1341 | 26::55::1::1424380312
1342 | 26::57::3::1424380312
1343 | 26::58::1::1424380312
1344 | 26::61::1::1424380312
1345 | 26::62::2::1424380312
1346 | 26::66::1::1424380312
1347 | 26::68::4::1424380312
1348 | 26::71::1::1424380312
1349 | 26::73::4::1424380312
1350 | 26::76::1::1424380312
1351 | 26::81::3::1424380312
1352 | 26::85::1::1424380312
1353 | 26::86::3::1424380312
1354 | 26::88::5::1424380312
1355 | 26::91::1::1424380312
1356 | 26::94::5::1424380312
1357 | 26::95::1::1424380312
1358 | 26::96::1::1424380312
1359 | 26::97::1::1424380312
1360 | 27::0::1::1424380312
1361 | 27::9::1::1424380312
1362 | 27::10::1::1424380312
1363 | 27::18::4::1424380312
1364 | 27::19::3::1424380312
1365 | 27::20::1::1424380312
1366 | 27::22::2::1424380312
1367 | 27::24::2::1424380312
1368 | 27::25::1::1424380312
1369 | 27::27::3::1424380312
1370 | 27::28::1::1424380312
1371 | 27::29::1::1424380312
1372 | 27::31::1::1424380312
1373 | 27::33::3::1424380312
1374 | 27::40::1::1424380312
1375 | 27::42::1::1424380312
1376 | 27::43::1::1424380312
1377 | 27::44::3::1424380312
1378 | 27::45::1::1424380312
1379 | 27::51::3::1424380312
1380 | 27::52::1::1424380312
1381 | 27::55::3::1424380312
1382 | 27::57::1::1424380312
1383 | 27::59::1::1424380312
1384 | 27::60::1::1424380312
1385 | 27::61::1::1424380312
1386 | 27::64::1::1424380312
1387 | 27::66::3::1424380312
1388 | 27::68::1::1424380312
1389 | 27::70::1::1424380312
1390 | 27::71::2::1424380312
1391 | 27::72::1::1424380312
1392 | 27::75::3::1424380312
1393 | 27::78::1::1424380312
1394 | 27::80::3::1424380312
1395 | 27::82::1::1424380312
1396 | 27::83::3::1424380312
1397 | 27::86::1::1424380312
1398 | 27::87::2::1424380312
1399 | 27::90::1::1424380312
1400 | 27::91::1::1424380312
1401 | 27::92::1::1424380312
1402 | 27::93::1::1424380312
1403 | 27::94::2::1424380312
1404 | 27::95::1::1424380312
1405 | 27::98::1::1424380312
1406 | 28::0::3::1424380312
1407 | 28::1::1::1424380312
1408 | 28::2::4::1424380312
1409 | 28::3::1::1424380312
1410 | 28::6::1::1424380312
1411 | 28::7::1::1424380312
1412 | 28::12::5::1424380312
1413 | 28::13::2::1424380312
1414 | 28::14::1::1424380312
1415 | 28::15::1::1424380312
1416 | 28::17::1::1424380312
1417 | 28::19::3::1424380312
1418 | 28::20::1::1424380312
1419 | 28::23::3::1424380312
1420 | 28::24::3::1424380312
1421 | 28::27::1::1424380312
1422 | 28::29::1::1424380312
1423 | 28::33::1::1424380312
1424 | 28::34::1::1424380312
1425 | 28::36::1::1424380312
1426 | 28::38::2::1424380312
1427 | 28::39::2::1424380312
1428 | 28::44::1::1424380312
1429 | 28::45::1::1424380312
1430 | 28::49::4::1424380312
1431 | 28::50::1::1424380312
1432 | 28::52::1::1424380312
1433 | 28::54::1::1424380312
1434 | 28::56::1::1424380312
1435 | 28::57::3::1424380312
1436 | 28::58::1::1424380312
1437 | 28::59::1::1424380312
1438 | 28::60::1::1424380312
1439 | 28::62::3::1424380312
1440 | 28::63::1::1424380312
1441 | 28::65::1::1424380312
1442 | 28::75::1::1424380312
1443 | 28::78::1::1424380312
1444 | 28::81::5::1424380312
1445 | 28::82::4::1424380312
1446 | 28::83::1::1424380312
1447 | 28::85::1::1424380312
1448 | 28::88::2::1424380312
1449 | 28::89::4::1424380312
1450 | 28::90::1::1424380312
1451 | 28::92::5::1424380312
1452 | 28::94::1::1424380312
1453 | 28::95::2::1424380312
1454 | 28::98::1::1424380312
1455 | 28::99::1::1424380312
1456 | 29::3::1::1424380312
1457 | 29::4::1::1424380312
1458 | 29::5::1::1424380312
1459 | 29::7::2::1424380312
1460 | 29::9::1::1424380312
1461 | 29::10::3::1424380312
1462 | 29::11::1::1424380312
1463 | 29::13::3::1424380312
1464 | 29::14::1::1424380312
1465 | 29::15::1::1424380312
1466 | 29::17::3::1424380312
1467 | 29::19::3::1424380312
1468 | 29::22::3::1424380312
1469 | 29::23::4::1424380312
1470 | 29::25::1::1424380312
1471 | 29::29::1::1424380312
1472 | 29::31::1::1424380312
1473 | 29::32::4::1424380312
1474 | 29::33::2::1424380312
1475 | 29::36::2::1424380312
1476 | 29::38::3::1424380312
1477 | 29::39::1::1424380312
1478 | 29::42::1::1424380312
1479 | 29::46::5::1424380312
1480 | 29::49::3::1424380312
1481 | 29::51::2::1424380312
1482 | 29::59::1::1424380312
1483 | 29::61::1::1424380312
1484 | 29::62::1::1424380312
1485 | 29::67::1::1424380312
1486 | 29::68::3::1424380312
1487 | 29::69::1::1424380312
1488 | 29::70::1::1424380312
1489 | 29::74::1::1424380312
1490 | 29::75::1::1424380312
1491 | 29::79::2::1424380312
1492 | 29::80::1::1424380312
1493 | 29::81::2::1424380312
1494 | 29::83::1::1424380312
1495 | 29::85::1::1424380312
1496 | 29::86::1::1424380312
1497 | 29::90::4::1424380312
1498 | 29::93::1::1424380312
1499 | 29::94::4::1424380312
1500 | 29::97::1::1424380312
1501 | 29::99::1::1424380312
1502 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_22/README.md:
--------------------------------------------------------------------------------
1 | ## Session 22 - Unsupervised learning models with Spark Machine Learning
2 | ### Chapter 29 - Unsupervised Learning
3 |
4 | 
5 |
6 | ### Summary
7 | Once again, we study Apache Spark's MLlib models for solving machine learning problems. This time, we explore the tools available for unsupervised learning, focusing specifically on clustering, and look at the challenges of training these models in a distributed computing environment.
8 |
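A minimal k-means sketch, assuming a `featuresDF` with an assembled `features` vector column (a hypothetical name):

```scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

val kmeans = new KMeans().setK(5).setSeed(42L).setFeaturesCol("features")
val model = kmeans.fit(featuresDF)
val predictions = model.transform(featuresDF) // adds a "prediction" cluster column

// Silhouette measures how well separated the resulting clusters are
val silhouette = new ClusteringEvaluator().evaluate(predictions)
println(s"Silhouette: $silhouette")
```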
9 | #### Session recording
10 | Pending
11 |
12 | #### Our social media
13 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
14 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
15 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
16 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
17 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_23/README.md:
--------------------------------------------------------------------------------
1 | ## Session 23 - Graph Analytics with GraphX
2 | ### Chapter 30 - Graph Analytics
3 |
4 | 
5 |
6 | ### Summary
7 | We study GraphX, the Apache Spark API that unifies graphs and tables and lets us express an entire graph-analytics pipeline within a single system. In this session, we walk through the steps of building and querying a graph with GraphFrames, apply motif finding to express structural patterns in a graph, and explore the algorithms available for analyzing this data structure.
8 |
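In brief, and as a sketch only (the full code is in the notebook below; `verticesDF` and `edgesDF` are assumed inputs), GraphFrames builds a graph from two DataFrames:

```scala
import org.graphframes.GraphFrame

// verticesDF needs an "id" column; edgesDF needs "src" and "dst" columns
val g = GraphFrame(verticesDF, edgesDF)

// Motif finding: a pattern of reciprocal trips between two stations
g.find("(a)-[ab]->(b); (b)-[ba]->(a)").show()
```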
9 | The (Scala) notebook and the script are provided as [`Spark-Chapter_30_notebook.scala`](Spark-Chapter_30_notebook.scala) and [`Spark-Chapter_30.scala`](Spark-Chapter_30.scala), respectively. We used the dataset that accompanies the book, available [here](https://github.com/databricks/Spark-The-Definitive-Guide/tree/master/data/bike-data).
10 |
11 | #### Session recording
12 | [](https://www.youtube.com/watch?v=M-7ADrQ5MB0)
13 |
14 |
15 | #### Our social media
16 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
17 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on Linkedin')
18 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
19 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
20 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_23/Spark-Chapter_30.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md
3 | // MAGIC # Chapter 30 - Graph Analytics
4 | // MAGIC
5 | // MAGIC ##### Index
6 | // MAGIC - Building a graph
7 | // MAGIC - Querying the graph
8 | // MAGIC - `Subgraphs`
9 | // MAGIC - Motif finding
10 | // MAGIC - Graph algorithms
11 | // MAGIC - `PageRank`
12 | // MAGIC - `In-degree and out-degree metrics`
13 | // MAGIC - `Breadth-first search`
14 | // MAGIC - `Connected components`
15 | // MAGIC - `Strongly connected components`
16 | // MAGIC - Conclusion
17 | // MAGIC
18 | // MAGIC - Reference
19 | // MAGIC Spark: The Definitive Guide
20 |
21 | // COMMAND ----------
22 |
23 | // MAGIC %md
24 | // MAGIC #### Loading the data
25 |
26 | // COMMAND ----------
27 |
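// Station metadata and trip records from the bike-share dataset that accompanies the book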
28 | val bikeStations = spark.read.option("header","true").csv("/FileStore/dataset/201508_station_data.csv")
29 | val tripData = spark.read.option("header","true").csv("/FileStore/dataset/201508_trip_data.csv")
30 |
31 | // COMMAND ----------
32 |
33 | // MAGIC %md
34 | // MAGIC #### Building the graph
35 |
36 | // COMMAND ----------
37 |
38 | val stationVertices = bikeStations.withColumnRenamed("name", "id").distinct()
39 | val tripEdges = tripData.withColumnRenamed("Start Station", "src").withColumnRenamed("End Station", "dst")
40 |
41 | // COMMAND ----------
42 |
43 | display(stationVertices)
44 |
45 | // COMMAND ----------
46 |
47 | display(tripEdges)
48 |
49 | // COMMAND ----------
50 |
51 | import org.graphframes.GraphFrame
52 |
53 | // COMMAND ----------
54 |
55 | val stationGraph = GraphFrame(stationVertices, tripEdges)
56 | stationGraph.cache()
57 |
58 | // COMMAND ----------
59 |
60 | println(s"Total Number of Stations: ${stationGraph.vertices.count()}")
61 | println(s"Total Number of Trips in Graph: ${stationGraph.edges.count()}")
62 | println(s"Total Number of Trips in Original Data: ${tripData.count()}")
63 |
64 | // COMMAND ----------
65 |
66 | // MAGIC %md
67 | // MAGIC #### Querying the graph
68 |
69 | // COMMAND ----------
70 |
71 | import org.apache.spark.sql.functions.desc
72 |
73 | // COMMAND ----------
74 |
75 | display(
76 | stationGraph.edges.groupBy("src", "dst").count().orderBy(desc("count")).limit(10)
77 | )
78 |
79 | // COMMAND ----------
80 |
81 | display(
82 | stationGraph.edges
83 | .where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")
84 | .groupBy("src", "dst").count()
85 | .orderBy(desc("count")).limit(10)
86 | )
87 |
88 | // COMMAND ----------
89 |
90 | // MAGIC %md
91 | // MAGIC ##### Subgraphs
92 |
93 | // COMMAND ----------
94 |
95 | val townAnd7thEdges = stationGraph.edges.where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")
96 | val subgraph = GraphFrame(stationGraph.vertices, townAnd7thEdges)
97 |
98 | // COMMAND ----------
99 |
100 | subgraph.vertices.count()
101 |
102 | // COMMAND ----------
103 |
104 | subgraph.edges.where("dst='Townsend at 7th'").count()
105 |
106 | // COMMAND ----------
107 |
108 | display(subgraph.inDegrees)
109 |
110 | // COMMAND ----------
111 |
112 | // MAGIC %md
113 | // MAGIC #### Motif finding
114 |
115 | // COMMAND ----------
116 |
117 | // MAGIC %md
118 | // MAGIC 
119 |
120 | // COMMAND ----------
121 |
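122 | // Find "triangle" motifs: a trip a->b, followed by b->c, followed by c->a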
122 | val motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)")
123 |
124 | // COMMAND ----------
125 |
126 | // MAGIC %md
127 | // MAGIC * ##### What is the shortest trip a bike has taken from station a, to station b, to station c, and back to station a?
128 |
129 | // COMMAND ----------
130 |
131 | import org.apache.spark.sql.functions.expr
132 |
133 | // COMMAND ----------
134 |
135 | display(
136 | motifs.selectExpr("*",
137 | "to_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm') as abStart",
138 | "to_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm') as bcStart",
139 | "to_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm') as caStart")
140 | .where("ca.`Bike #` = bc.`Bike #`").where("ab.`Bike #` = bc.`Bike #`")
141 | .where("a.id != b.id").where("b.id != c.id")
142 | .where("abStart < bcStart").where("bcStart < caStart")
143 | .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))
144 | .selectExpr("a.id", "b.id", "c.id", "ab.`Start Date`", "ca.`End Date`")
145 | //.limit(1)
146 | )
147 |
148 | // COMMAND ----------
149 |
150 | // MAGIC %md
151 | // MAGIC ## Graph Algorithms
152 |
153 | // COMMAND ----------
154 |
155 | // MAGIC %md
156 | // MAGIC #### PageRank
157 |
158 | // COMMAND ----------
159 |
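160 | // resetProbability(0.15) is the random-reset probability (1 - damping factor); maxIter(10) bounds the runtime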
160 | val ranks = stationGraph.pageRank.resetProbability(0.15).maxIter(10).run()
161 | ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(10)
162 |
163 | // COMMAND ----------
164 |
165 | // MAGIC %md
166 | // MAGIC #### In-degree and out-degree metrics
167 |
168 | // COMMAND ----------
169 |
170 | // MAGIC %md
171 | // MAGIC 
172 |
173 | // COMMAND ----------
174 |
175 | val inDeg = stationGraph.inDegrees
176 | inDeg.orderBy(desc("inDegree")).show(5, false)
177 |
178 | // COMMAND ----------
179 |
180 | val outDeg = stationGraph.outDegrees
181 | outDeg.orderBy(desc("outDegree")).show(5, false)
182 |
183 | // COMMAND ----------
184 |
185 | val degreeRatio = inDeg.join(outDeg, Seq("id")).selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")
186 | degreeRatio.orderBy(desc("degreeRatio")).show(10, false)
187 | degreeRatio.orderBy("degreeRatio").show(10, false)
188 |
189 | // COMMAND ----------
190 |
191 | // MAGIC %md
192 | // MAGIC #### Breadth-first search
193 |
194 | // COMMAND ----------
195 |
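196 | // Breadth-first search from 'Townsend at 7th' to 'Spear at Folsom', following at most 2 edges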
196 | stationGraph.bfs.fromExpr("id = 'Townsend at 7th'").toExpr("id = 'Spear at Folsom'").maxPathLength(2).run().show(10)
197 |
198 | // COMMAND ----------
199 |
200 | // MAGIC %md
201 | // MAGIC #### Connected components
202 |
203 | // COMMAND ----------
204 |
205 | // MAGIC %md
206 | // MAGIC 
207 |
208 | // COMMAND ----------
209 |
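210 | // GraphFrames' connectedComponents requires a checkpoint directory to truncate the growing DataFrame lineage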
210 | spark.sparkContext.setCheckpointDir("/tmp/checkpoints")
211 |
212 | // COMMAND ----------
213 |
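214 | // Run on a ~10% edge sample to keep this iterative algorithm quick for the demo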
214 | val minGraph = GraphFrame(stationVertices, tripEdges.sample(false, 0.1))
215 | val cc = minGraph.connectedComponents.run()
216 |
217 | // COMMAND ----------
218 |
219 | cc.where("component != 0").show()
220 |
221 | // COMMAND ----------
222 |
223 | // MAGIC %md
224 | // MAGIC #### Strongly connected components
225 |
226 | // COMMAND ----------
227 |
228 | val scc = minGraph.stronglyConnectedComponents.maxIter(3).run()
229 |
230 | // COMMAND ----------
231 |
232 |
233 |
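234 | // MAGIC %md
235 | // MAGIC A minimal sketch of one way to inspect the result (not part of the original session): count the vertices in each strongly connected component and surface the largest ones.
236 |
237 | // COMMAND ----------
238 |
239 | // Group vertices by component id and rank components by size
240 | scc.groupBy("component").count().orderBy(desc("count")).show(5, false)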
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_24/README.md:
--------------------------------------------------------------------------------
1 | ## Session 24 - Deep Learning in Apache Spark
2 | ### Chapter 31 - Deep Learning
3 |
4 | 
5 |
6 | ### Summary
7 | In this session we study Deep Learning models in Apache Spark: we get familiar with Deep Learning, understand why it matters in Apache Spark, explore the most popular libraries, and walk through an example implementation in a pipeline.
8 |
9 | The notebook used during the session is [`SparkDL.ipynb`](SparkDL.ipynb).
10 |
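11 | As a rough illustration of the pipeline pattern the session builds on, here is a minimal Spark ML sketch in Scala (the session itself works in Python, and the featurization stage would be a deep learning featurizer; all column names below are placeholders):
12 |
13 | ```scala
14 | import org.apache.spark.ml.Pipeline
15 | import org.apache.spark.ml.classification.LogisticRegression
16 | import org.apache.spark.ml.feature.VectorAssembler
17 |
18 | // Featurization stage: in the session, a deep learning featurizer plays this role
19 | val assembler = new VectorAssembler()
20 |   .setInputCols(Array("f1", "f2")) // placeholder feature columns
21 |   .setOutputCol("features")
22 |
23 | // Classifier trained on the extracted features
24 | val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("features")
25 |
26 | // Fitting the pipeline fits each stage in order
27 | val pipeline = new Pipeline().setStages(Array(assembler, lr))
28 | // val model = pipeline.fit(trainingDF) // trainingDF is a placeholder DataFrame
29 | ```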
11 | #### Session recording
12 | [](https://www.youtube.com/watch?v=WOf-VNnfz60)
13 |
14 |
15 | #### Our social media
16 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
17 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on LinkedIn')
18 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
19 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
20 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_5/README.md:
--------------------------------------------------------------------------------
1 | ## Session 5 - Basic Structured Operations
2 | ### Chapter 5 - Basic Structured Operations
3 |
4 | 
5 |
6 | ### Summary
7 | This session covered Spark's basic structured operations. Topics discussed included:
8 |
9 | * Schemas
10 |
11 | * Columns and Expressions:
12 | * Columns
13 | * Expressions
14 |
15 | * Records and Rows:
16 | * Creating Rows
17 |
18 | * DataFrame Transformations:
19 | * Creating DataFrames
20 | * select and selectExpr
21 | * Converting to Spark Types (Literals)
22 | * Adding Columns
23 | * Renaming Columns
24 | * Reserved Characters and Keywords
25 | * Case Sensitivity
26 | * Removing Columns
27 | * Changing a Column’s Type (cast)
28 | * Filtering Rows
29 | * Getting Unique Rows
30 | * Random Samples
31 | * Random Splits
32 | * Concatenating and Appending Rows (Union)
33 | * Sorting Rows
34 | * Limit
35 | * Repartition and Coalesce
36 | * Collecting Rows to the Driver
37 |
38 | The examples presented were executed in a Scala file named `basic_structured_operation.scala`; a quick sketch of a few of these operations follows below.
39 |
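40 | For a quick taste of a few of these operations, here is a minimal sketch (it assumes the 2015 flight-summary dataset loaded in the session's notebook):
41 |
42 | ```scala
43 | import org.apache.spark.sql.functions.lit
44 |
45 | val df = spark.read.format("json").load("/FileStore/dataset/2015_summary.json")
46 |
47 | df.select("DEST_COUNTRY_NAME", "count") // select specific columns
48 |   .withColumn("one", lit(1))           // add a literal column
49 |   .where("count > 10")                 // filter rows
50 |   .distinct()                          // keep unique rows
51 |   .show(5)
52 | ```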
40 | #### Session recording
41 |
42 | [](https://www.youtube.com/watch?v=CxnTp5ZDAGE)
43 |
44 | #### Our social media
45 | * [Youtube](https://www.youtube.com/channel/UCqFCoUEvxR23ymmih0GD7mQ?sub_confirmation=1 'Subscribe to the channel')
46 | * [Linkedin](https://www.linkedin.com/company/data-engineering-latam/ 'Follow us on LinkedIn')
47 | * [Facebook](https://www.facebook.com/dataengineeringlatam/ 'Follow us on Facebook')
48 | * [Website](https://beacons.ai/dataengineeringlatam 'Our website')
49 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_5/basic_structured_operation.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | // MAGIC %md
3 | // MAGIC # Chapter 5 - Basic Structured Operations
4 | // MAGIC
5 | // MAGIC ##### Index
6 | // MAGIC - Schemas
7 | // MAGIC - Columns and Expressions:
8 | // MAGIC - `Columns`
9 | // MAGIC - `Expressions`
10 | // MAGIC - Records and Rows:
11 | // MAGIC - `Creating Rows`
12 | // MAGIC - DataFrame Transformations:
13 | // MAGIC - `Creating DataFrames`
14 | // MAGIC - `select and selectExpr`
15 | // MAGIC - `Converting to Spark Types (Literals)`
16 | // MAGIC - `Adding Columns`
17 | // MAGIC - `Renaming Columns`
18 | // MAGIC - `Reserved Characters and Keywords`
19 | // MAGIC - `Case Sensitivity`
20 | // MAGIC - `Removing Columns`
21 | // MAGIC - `Changing a Column’s Type (cast)`
22 | // MAGIC - `Filtering Rows`
23 | // MAGIC - `Getting Unique Rows`
24 | // MAGIC - `Random Samples`
25 | // MAGIC - `Random Splits`
26 | // MAGIC - `Concatenating and Appending Rows (Union)`
27 | // MAGIC - `Sorting Rows`
28 | // MAGIC - `Limit`
29 | // MAGIC - `Repartition and Coalesce`
30 | // MAGIC - `Collecting Rows to the Driver`
31 | // MAGIC - Conclusion
32 | // MAGIC
33 | // MAGIC - Documentation
34 | // MAGIC Spark: The Definitive Guide
35 |
36 | // COMMAND ----------
37 |
38 | var df = spark.read.format("json").load("/FileStore/dataset/2015_summary.json")
39 |
40 | // COMMAND ----------
41 |
42 | // MAGIC %md
43 | // MAGIC ### Schemas
44 |
45 | // COMMAND ----------
46 |
47 | df.printSchema()
48 |
49 | // COMMAND ----------
50 |
51 | df.schema
52 |
53 | // COMMAND ----------
54 |
55 | import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}
56 | import org.apache.spark.sql.types.Metadata
57 |
58 | val myManualSchema = StructType(Array(
59 | StructField(
60 | "DEST_COUNTRY_NAME", StringType, true, Metadata.fromJson("{\"description\":\"Origin Country Name\"}")
61 | ),
62 | StructField(
63 | "ORIGIN_COUNTRY_NAME", StringType, true, Metadata.fromJson("{\"description\":\"Destine Country Name\"}")
64 | ),
65 | StructField(
66 | "count", LongType, false, Metadata.fromJson("{\"description\":\"number of flights\"}")
67 | )
68 | ))
69 |
70 | var df = spark.read.format("json").schema(myManualSchema).load("/FileStore/dataset/2015_summary.json")
71 |
72 | // COMMAND ----------
73 |
74 | df.printSchema()
75 | df.schema
76 |
77 | // COMMAND ----------
78 |
79 | df.schema.json
80 |
81 | // COMMAND ----------
82 |
83 | df.schema.foreach{s => println(s"${s.name}, ${s.metadata.toString}")}
84 |
85 | // COMMAND ----------
86 |
87 | display(spark.catalog.listDatabases)
88 |
89 | // COMMAND ----------
90 |
91 | // MAGIC %md
92 | // MAGIC ### Columns and Expressions
93 |
94 | // COMMAND ----------
95 |
96 | // MAGIC %md
97 | // MAGIC - ##### Columns
98 |
99 | // COMMAND ----------
100 |
101 | import org.apache.spark.sql.functions.{col, column}
102 |
103 | col("someColumnName")
104 | column("someColumnName")
105 |
106 | // COMMAND ----------
107 |
108 | $"myColumn"
109 |
110 | // COMMAND ----------
111 |
112 | 'myColumn
113 |
114 | // COMMAND ----------
115 |
116 | df.col("count")
117 |
118 | // COMMAND ----------
119 |
120 | // MAGIC %md
121 | // MAGIC - ##### Expressions
122 |
123 | // COMMAND ----------
124 |
125 | // MAGIC %md
126 | // MAGIC - ###### Columns as expressions
127 | // MAGIC
128 | // MAGIC expr("someCol") is equivalent to col("someCol")
129 |
130 | // COMMAND ----------
131 |
132 | // MAGIC %md
133 | // MAGIC 
134 |
135 | // COMMAND ----------
136 |
137 | import org.apache.spark.sql.functions.expr
138 | expr("(((someCol + 5) * 200) - 6) < otherCol")
139 |
140 | // COMMAND ----------
141 |
142 | // MAGIC %md
143 | // MAGIC - ###### Accessing a DataFrame’s columns
144 |
145 | // COMMAND ----------
146 |
147 | spark.read.format("json").load("/FileStore/dataset/2015_summary.json").columns
148 |
149 | // COMMAND ----------
150 |
151 | // MAGIC %md
152 | // MAGIC ### Records and Rows
153 |
154 | // COMMAND ----------
155 |
156 | df.first()
157 |
158 | // COMMAND ----------
159 |
160 | // MAGIC %md
161 | // MAGIC - ##### Creating Rows
162 |
163 | // COMMAND ----------
164 |
165 | import org.apache.spark.sql.Row
166 | var myRow = Row("Hello", null, 1, false)
166 |
167 | // COMMAND ----------
168 |
169 | println(myRow(0)) // type Any
170 | println(myRow(0).asInstanceOf[String]) // String
171 | println(myRow.isNullAt(1))// isNull
172 | println(myRow.getInt(2)) // Int
173 | println(myRow.getBoolean(3)) // Bool
174 |
175 | // COMMAND ----------
176 |
177 | // MAGIC %md
178 | // MAGIC ### DataFrame Transformations
179 | // MAGIC 
180 |
181 | // COMMAND ----------
182 |
183 | // MAGIC %md
184 | // MAGIC - ##### Creating DataFrames
185 |
186 | // COMMAND ----------
187 |
188 | var df = spark.read.format("json").load("/FileStore/dataset/2015_summary.json")
189 | df.createOrReplaceTempView("dfTable")
190 |
191 | // COMMAND ----------
192 |
193 | // MAGIC %sql
194 | // MAGIC select * from dfTable;
195 |
196 | // COMMAND ----------
197 |
198 | import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}
199 |
200 | var myManualSchema = new StructType(Array(
201 | new StructField("some", StringType, true),
202 | new StructField("col", StringType, true),
203 | new StructField("names", LongType, false)))
204 |
205 | var myRows = Seq(Row("Hello", null, 1L))
206 | var myRDD = spark.sparkContext.parallelize(myRows)
207 | var myDf = spark.createDataFrame(myRDD, myManualSchema)
208 |
209 | display(myDf)
210 |
211 | // COMMAND ----------
212 |
213 | df.createOrReplaceTempView("tmpView")
214 |
215 | spark.sql("create table default.summar as select * from tmpView")
216 |
217 | // COMMAND ----------
218 |
219 | spark.catalog.listColumns("default","summary").show
220 |
221 | // COMMAND ----------
222 |
223 | spark.sql("show tables").show()
224 |
225 | // COMMAND ----------
226 |
227 | // MAGIC %sql
228 | // MAGIC show tables;
229 |
230 | // COMMAND ----------
231 |
232 | // MAGIC %md
233 | // MAGIC - ##### select and selectExpr
234 |
235 | // COMMAND ----------
236 |
237 | // MAGIC %sql
238 | // MAGIC SELECT * FROM dataFrameTable;
239 | // MAGIC SELECT columnName FROM dataFrameTable;
240 | // MAGIC SELECT columnName * 10, otherColumn, someOtherCol as c FROM dataFrameTable;
241 |
242 | // COMMAND ----------
243 |
244 | df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)
245 |
246 | // COMMAND ----------
247 |
248 | // MAGIC %sql
249 | // MAGIC SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME FROM dfTable LIMIT 2
250 |
251 | // COMMAND ----------
252 |
253 | import org.apache.spark.sql.functions.{expr, col, column}
254 |
255 | df.select(
256 | df.col("DEST_COUNTRY_NAME"),
257 | col("DEST_COUNTRY_NAME"),
258 | column("DEST_COUNTRY_NAME"),
259 | 'DEST_COUNTRY_NAME,
260 | $"DEST_COUNTRY_NAME",
261 | expr("DEST_COUNTRY_NAME")
262 | ).show(2)
263 |
264 | // COMMAND ----------
265 |
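266 | // Note: mixing a Column object and a string like this does not compile; use one style or the other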
266 | df.select(col("DEST_COUNTRY_NAME"), "DEST_COUNTRY_NAME")
267 |
268 | // COMMAND ----------
269 |
270 | df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)
271 |
272 | // COMMAND ----------
273 |
274 | // MAGIC %sql
275 | // MAGIC SELECT DEST_COUNTRY_NAME as destination FROM dfTable LIMIT 2
276 |
277 | // COMMAND ----------
278 |
279 | df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")).show(2)
280 |
281 | // COMMAND ----------
282 |
283 | df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)
284 |
285 | // COMMAND ----------
286 |
287 | df.selectExpr(
288 | "*", // include all original columns
289 | "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")
290 | .show(2)
291 |
292 | // COMMAND ----------
293 |
294 | // MAGIC %sql
295 | // MAGIC SELECT *, (DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry
296 | // MAGIC FROM dfTable
297 | // MAGIC LIMIT 2
298 |
299 | // COMMAND ----------
300 |
301 | df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)
302 |
303 | // COMMAND ----------
304 |
305 | // MAGIC %sql
306 | // MAGIC SELECT avg(count), count(distinct(DEST_COUNTRY_NAME)) FROM dfTable LIMIT 2
307 |
308 | // COMMAND ----------
309 |
310 | import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}
311 | import org.apache.spark.sql.types.Metadata
312 | import org.apache.spark.sql.functions.{expr, col, column, lit}
313 | import spark.implicits._ // For implicit conversions
314 |
315 | val mySchema = StructType(Array(
316 | StructField(
317 | "DEST_COUNTRY_NAME", StringType, true, Metadata.fromJson("{\"description\":\"Origin Country Name\"}")
318 | ),
319 | StructField(
320 | "ORIGIN_COUNTRY_NAME", StringType, true, Metadata.fromJson("{\"description\":\"Destine Country Name\"}")
321 | ),
322 | StructField(
323 | "count", LongType, false, Metadata.fromJson("{\"description\":\"number of flights\"}")
324 | )
325 | ))
326 |
327 | var df = spark.read.format("json").schema(mySchema).load("/FileStore/dataset/2015_summary.json")
328 | df.createOrReplaceTempView("dfTable")
329 |
330 | // COMMAND ----------
331 |
332 | // MAGIC %md
333 | // MAGIC - ##### Converting to Spark Types (Literals)
334 |
335 | // COMMAND ----------
336 |
337 | df.select(expr("*"), lit(1).as("One")).show(2)
338 |
339 | // COMMAND ----------
340 |
341 | // MAGIC %sql
342 | // MAGIC SELECT *, 1 as One FROM dfTable LIMIT 2
343 |
344 | // COMMAND ----------
345 |
346 | // MAGIC %md
347 | // MAGIC - ##### Adding Columns
348 |
349 | // COMMAND ----------
350 |
351 | var df_new = df.withColumn("numberOne", lit(1))
352 | df_new.show(2)
353 |
354 | // COMMAND ----------
355 |
356 | df_new = df_new.withColumn("withinCountry", expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))
357 | df_new.show(2)
358 |
359 | // COMMAND ----------
360 |
361 | df_new.printSchema()
362 |
363 | // COMMAND ----------
364 |
365 | display(df_new)
366 |
367 | // COMMAND ----------
368 |
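369 | // Add several columns at once by folding (column name, expression) pairs over the DataFrame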
369 | var cols_new2=List(("user2",lit("TIC")),("fecha2",expr("current_date()")))
370 | var df_new_schema2 = cols_new2.foldLeft(df_new){ (tempdf, cols) => tempdf.withColumn(cols._1,cols._2) }
371 | df_new_schema2.printSchema()
372 |
373 | // COMMAND ----------
374 |
375 | display(df_new_schema2)
376 |
377 | // COMMAND ----------
378 |
379 | // MAGIC %md
380 | // MAGIC - ##### Renaming Columns
381 |
382 | // COMMAND ----------
383 |
384 | df_new_schema2.withColumnRenamed("DEST_COUNTRY_NAME", "dest").show(2)
385 |
386 | // COMMAND ----------
387 |
388 | // MAGIC %md
389 | // MAGIC - ##### Reserved Characters and Keywords
390 |
391 | // COMMAND ----------
392 |
393 | display(df)
394 |
395 | // COMMAND ----------
396 |
397 | import org.apache.spark.sql.functions.expr
398 |
399 | var dfWithLongColName = df.withColumn(
400 | "This Long Column-Name",
401 | expr("ORIGIN_COUNTRY_NAME"))
402 |
403 | // COMMAND ----------
404 |
405 | dfWithLongColName.show(2)
406 |
407 | // COMMAND ----------
408 |
409 | dfWithLongColName.selectExpr(
410 | "`This Long Column-Name`",
411 | "`This Long Column-Name` as `new col`")
412 | .show(2)
413 |
414 | // COMMAND ----------
415 |
416 | dfWithLongColName.createOrReplaceTempView("dfTableLong")
417 |
418 | // COMMAND ----------
419 |
420 | // MAGIC %sql
421 | // MAGIC SELECT `This Long Column-Name`, `This Long Column-Name` as `new col`
422 | // MAGIC FROM dfTableLong LIMIT 2
423 |
424 | // COMMAND ----------
425 |
426 | // MAGIC %md
427 | // MAGIC - ##### Case Sensitivity
428 |
429 | // COMMAND ----------
430 |
431 | spark.conf.get("spark.sql.caseSensitive")
432 | // valid values: "true" or "false" (default is false)
433 | spark.conf.set("spark.sql.caseSensitive", "false")
434 |
435 |
436 | // COMMAND ----------
437 |
438 | // MAGIC %md
439 | // MAGIC - ##### Removing Columns
440 |
441 | // COMMAND ----------
442 |
443 | df.drop("ORIGIN_COUNTRY_NAME").columns
444 |
445 | // COMMAND ----------
446 |
447 | dfWithLongColName.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").columns
448 |
449 | // COMMAND ----------
450 |
451 | // MAGIC %md
452 | // MAGIC - ##### Changing a Column’s Type (cast)
453 |
454 | // COMMAND ----------
455 |
456 | df.withColumn("count2", col("count").cast("long"))
457 |
458 | // COMMAND ----------
459 |
460 | // MAGIC %sql
461 | // MAGIC SELECT *, cast(count as long) AS count2 FROM dfTable
462 |
463 | // COMMAND ----------
464 |
465 | // MAGIC %md
466 | // MAGIC - ##### Filtering Rows
467 |
468 | // COMMAND ----------
469 |
470 | df.filter(col("count") < 2).show(2)
471 |
472 | // COMMAND ----------
473 |
474 | df.where("count < 2").show(2)
475 |
476 | // COMMAND ----------
477 |
478 | df.where(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") =!= "Croatia")
479 | .show(2)
480 |
481 | // COMMAND ----------
482 |
483 | // MAGIC %md
484 | // MAGIC - ##### Getting Unique Rows
485 |
486 | // COMMAND ----------
487 |
488 | df.select("ORIGIN_COUNTRY_NAME").count()
489 |
490 | // COMMAND ----------
491 |
492 | df.select("ORIGIN_COUNTRY_NAME").distinct().count()
493 |
494 | // COMMAND ----------
495 |
496 | // MAGIC %md
497 | // MAGIC - ##### Random Samples
498 |
499 | // COMMAND ----------
500 |
501 | var seed = 5
502 | var withReplacement = false
503 | var fraction = 0.8
504 | df.sample(withReplacement, fraction, seed).count()
505 |
506 | // COMMAND ----------
507 |
508 | // MAGIC %md
509 | // MAGIC - ##### Random Splits
510 |
511 | // COMMAND ----------
512 |
513 | val dataFrames = df.randomSplit(Array(0.70, 0.30), seed)
514 |
515 | // COMMAND ----------
516 |
517 | dataFrames(0).count() > dataFrames(1).count()
518 |
519 | // COMMAND ----------
520 |
521 | display(dataFrames(0))
522 |
523 | // COMMAND ----------
524 |
525 | // MAGIC %md
526 | // MAGIC - ##### Concatenating and Appending Rows (Union)
527 |
528 | // COMMAND ----------
529 |
530 | import org.apache.spark.sql.Row
531 |
532 | var schema = df.schema
533 | var newRows = Seq(
534 | Row("New Country", "Other Country", 5L),
535 | Row("New Country 2", "Other Country 3", 1L)
536 | )
537 |
538 | var parallelizedRows = spark.sparkContext.parallelize(newRows)
539 | var newDF = spark.createDataFrame(parallelizedRows, schema)
540 |
541 | df.union(newDF)
542 | .where("count = 1")
543 | .where($"ORIGIN_COUNTRY_NAME" =!= "United States")
544 | .show()
545 |
546 | // COMMAND ----------
547 |
548 | // MAGIC %md
549 | // MAGIC - ##### Sorting Rows
550 |
551 | // COMMAND ----------
552 |
553 | df.sort("count").show(5)
554 |
555 | // COMMAND ----------
556 |
557 | df.orderBy("count", "DEST_COUNTRY_NAME").show(2)
558 |
559 | // COMMAND ----------
560 |
561 | import org.apache.spark.sql.functions.{desc, asc}
562 |
563 | df.orderBy(expr("count desc")).show(2)
564 |
565 | // COMMAND ----------
566 |
567 | df.orderBy(desc("count"), asc("DEST_COUNTRY_NAME")).show(2)
568 |
569 | df.createOrReplaceTempView("dfTable")
570 |
571 | // COMMAND ----------
572 |
573 | // MAGIC %sql
574 | // MAGIC SELECT * FROM dfTable ORDER BY count DESC, DEST_COUNTRY_NAME ASC LIMIT 2
575 |
576 | // COMMAND ----------
577 |
578 | var dfSort = spark.read.format("json").load("/FileStore/dataset/2015_summary.json")
579 | .sortWithinPartitions("count")
580 |
581 | // COMMAND ----------
582 |
583 | display(dfSort)
584 |
585 | // COMMAND ----------
586 |
587 | // MAGIC %md
588 | // MAGIC - ##### Limit
589 |
590 | // COMMAND ----------
591 |
592 | df.limit(5).show()
593 |
594 | // COMMAND ----------
595 |
596 | df.orderBy(expr("count desc")).limit(6).show()
597 |
598 | // COMMAND ----------
599 |
600 | // MAGIC %sql
601 | // MAGIC SELECT * FROM dfTable ORDER BY count desc LIMIT 6
602 |
603 | // COMMAND ----------
604 |
605 | // MAGIC %md
606 | // MAGIC - ##### Repartition and Coalesce
607 |
608 | // COMMAND ----------
609 |
610 | // MAGIC %md
611 | // MAGIC * ###### Repartition
612 |
613 | // COMMAND ----------
614 |
615 | df.repartition(5)
616 |
617 | // COMMAND ----------
618 |
619 | df.repartition(col("DEST_COUNTRY_NAME"))
620 |
621 | // COMMAND ----------
622 |
623 | df.repartition(5, col("DEST_COUNTRY_NAME"))
624 |
625 | // COMMAND ----------
626 |
627 | // MAGIC %md
628 | // MAGIC * ###### Coalesce
629 |
630 | // COMMAND ----------
631 |
632 | df.repartition(5, col("DEST_COUNTRY_NAME")).coalesce(2)
633 |
634 | // COMMAND ----------
635 |
636 | // MAGIC %md
637 | // MAGIC - ##### Collecting Rows to the Driver
638 |
639 | // COMMAND ----------
640 |
641 | var collectDF = df.limit(10)
642 |
643 | // COMMAND ----------
644 |
645 | display(collectDF)
646 |
647 | // COMMAND ----------
648 |
649 | var df1 = collectDF.take(5) // take(5) returns Array[Row], not a DataFrame
650 | df1.foreach(println) // so print the rows instead of calling display(), which expects a DataFrame
651 |
652 | // COMMAND ----------
653 |
654 | collectDF.show()
655 |
656 | // COMMAND ----------
657 |
658 | collectDF.show(5, false)
659 |
660 | // COMMAND ----------
661 |
662 | collectDF.collect()
663 |
664 | // COMMAND ----------
665 |
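666 | // Returns an iterator that brings rows to the driver one partition at a time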
666 | collectDF.toLocalIterator()
667 |
--------------------------------------------------------------------------------
/Spark - The Definite Guide/sessions/session_5/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |