-- Databricks notebook source
-- MAGIC
-- MAGIC %md-sandbox
-- MAGIC
-- MAGIC
-- MAGIC

-- MAGIC
-- COMMAND ----------
-- MAGIC %md
-- MAGIC # Engineering Data Pipelines
-- MAGIC ## Module 3 Assignment
-- MAGIC
-- MAGIC ##  In this assignment you:
-- MAGIC * Create a table with persistent data and a specified schema
-- MAGIC * Populate table with specific entries
-- MAGIC * Change partition number to compare query speeds
-- MAGIC
-- MAGIC For each **bold** question, input its answer in Coursera.
-- COMMAND ----------
-- MAGIC %run ../Includes/Classroom-Setup
-- COMMAND ----------
-- MAGIC %md-sandbox
-- MAGIC Create a table whose data will remain after you drop the table and after the cluster shuts down. Name this table `newTable` and specify the location to be at `/tmp/newTableLoc`
-- MAGIC
-- MAGIC Set up the table to have the following schema:
-- MAGIC
-- MAGIC ```
-- MAGIC `Address` STRING,
-- MAGIC `City` STRING,
-- MAGIC `Battalion` STRING,
-- MAGIC `Box` STRING,
-- MAGIC ```
-- MAGIC
-- MAGIC Run the following cell first to remove any files stored at `/tmp/newTableLoc` before creating our table. Be sure to first re-run that cell each time you create `newTable`.
-- MAGIC
-- MAGIC
This course was designed to work with Databricks Runtime 5.5 LTS ML, which uses Spark 2.4. If you are running a later version of the Databricks Runtime, you might have to add an additional `STORED AS parquet` to your query [due to a bug.](https://issues.apache.org/jira/browse/SPARK-30436)
-- COMMAND ----------
-- MAGIC %python
-- MAGIC # removes files stored at '/tmp/newTableLoc'
-- MAGIC dbutils.fs.rm("/tmp/newTableLoc", True)
-- COMMAND ----------
-- TODO
DROP TABLE IF EXISTS newTable;
CREATE EXTERNAL TABLE newTable (
`Address` STRING,
`City` STRING,
`Battalion` STRING,
`Box` STRING
)
STORED AS parquet
LOCATION '/tmp/newTableLoc'
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Check that the data type of each column is what we want.
-- MAGIC
-- MAGIC ### Question 1
-- MAGIC **What type of table is `newTable`? "EXTERNAL" or "MANAGED"?**
-- COMMAND ----------
DESCRIBE EXTENDED newTable
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Run the following cell to read in the data stored at `/mnt/davis/fire-calls/fire-calls-truncated.json`. Check that the columns of the data are of the correct types (not all strings).
-- COMMAND ----------
CREATE OR REPLACE TEMPORARY VIEW fireCallsJSON (
`ALS Unit` boolean,
`Address` string,
`Available DtTm` string,
`Battalion` string,
`Box` string,
`Call Date` string,
`Call Final Disposition` string,
`Call Number` long,
`Call Type` string,
`Call Type Group` string,
`City` string,
`Dispatch DtTm` string,
`Entry DtTm` string,
`Final Priority` long,
`Fire Prevention District` string,
`Hospital DtTm` string,
`Incident Number` long,
`Location` string,
`Neighborhooods - Analysis Boundaries` string,
`Number of Alarms` long,
`On Scene DtTm` string,
`Original Priority` string,
`Priority` string,
`Received DtTm` string,
`Response DtTm` string,
`RowID` string,
`Station Area` string,
`Supervisor District` string,
`Transport DtTm` string,
`Unit ID` string,
`Unit Type` string,
`Unit sequence in call dispatch` long,
`Watch Date` string,
`Zipcode of Incident` long
)
USING JSON
OPTIONS (
path "/mnt/davis/fire-calls/fire-calls-truncated.json"
);
DESCRIBE fireCallsJSON
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Take a look at the table to make sure it looks correct.
-- COMMAND ----------
SELECT * FROM fireCallsJSON LIMIT 10
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Now let's populate `newTable` with some of the rows from the `fireCallsJSON` table you just loaded. We only want to include fire calls whose `Final Priority` is `3`.
-- COMMAND ----------
-- TODO
INSERT INTO newTable
SELECT Address, City, Battalion, Box
FROM fireCallsJSON
WHERE `Final Priority`=3;
-- COMMAND ----------
select * from newTable
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ### Question 2
-- MAGIC
-- MAGIC **How many rows are in `newTable`? **
-- COMMAND ----------
-- TODO
SELECT COUNT(*) FROM newTable
-- COMMAND ----------
-- MAGIC %md
-- MAGIC
-- MAGIC Sort the rows of `newTable` by ascending `Battalion`.
-- MAGIC
-- MAGIC ### Question 3
-- MAGIC
-- MAGIC **What is the "Battalion" of the first entry in the sorted table?**
-- COMMAND ----------
-- TODO
SELECT * FROM newTable
ORDER BY Battalion ASC
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Let's see how this table is stored in our file system.
-- MAGIC
-- MAGIC Note: You should have specified the location of the table to be `/tmp/newTableLoc` when you created it.
-- COMMAND ----------
-- MAGIC %fs ls dbfs:/tmp/newTableLoc
-- COMMAND ----------
-- MAGIC %md
-- MAGIC
-- MAGIC First run the following cell to check how many partitions are in this table. Did the number of partitions match the number of files our data was stored as?
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Let's try increasing the number of partitions to 256. Create this as a new table and call it `newTablePartitioned`
-- COMMAND ----------
-- TODO
CREATE TABLE IF NOT EXISTS newTablePartitioned
AS
SELECT /*+ REPARTITION(256) */ *
FROM newTable
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Now let's take a look at how this new table is stored.
-- COMMAND ----------
DESCRIBE EXTENDED newTablePartitioned
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Copy the location of the `newTablePartitioned` from the table above and take a look at the files stored at that location. Now how many parts is our data stored?
-- COMMAND ----------
-- MAGIC %fs ls dbfs:/user/hive/warehouse/databricks.db/newtablepartitioned
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Now sort the rows of `newTablePartitioned` by ascending `Battalion` and compare how long this query takes.
-- MAGIC
-- MAGIC ### Question 4
-- MAGIC
-- MAGIC **Was this query faster or slower on the table with increased partitions?**
-- COMMAND ----------
SELECT * FROM newTablePartitioned ORDER BY `Battalion`
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Run the following cell to see where the data of the original `newTable` is stored.
-- COMMAND ----------
-- MAGIC %fs ls dbfs:/tmp/newTableLoc
-- COMMAND ----------
-- MAGIC %md
-- MAGIC Now drop the table `newTable`.
-- COMMAND ----------
DROP TABLE newTable;
-- The following line should error!
-- SELECT * FROM newTable;
-- COMMAND ----------
-- MAGIC %md
-- MAGIC ### Question 5
-- MAGIC
-- MAGIC **Does the data stored within the table still exist at the original location (`dbfs:/tmp/newTableLoc`) after you dropped the table? (Answer "yes" or "no")**
-- COMMAND ----------
-- MAGIC %fs ls dbfs:/tmp/newTableLoc
-- COMMAND ----------
-- MAGIC %md-sandbox
-- MAGIC © 2020 Databricks, Inc. All rights reserved.
-- MAGIC Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation.
-- MAGIC
-- MAGIC Privacy Policy | Terms of Use | Support
Footer
© 2023 GitHub, Inc.
Footer navigation
Terms
Privacy
Security
Status
Docs
Contact GitHub
Pricing
API
Training
Blog
About