Skip to content

Commit 33d16ad

Browse files
committedDec 29, 2023
Added Missing Notebooks
1 parent c42be84 commit 33d16ad

File tree

6 files changed

+226
-0
lines changed

6 files changed

+226
-0
lines changed
 

‎01-getting-started.ipynb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"cells":[{"cell_type":"code","source":["diamonds_df = spark.read.format(\"csv\") \\\n .option(\"header\", \"true\") \\\n .option(\"inferSchema\", \"true\") \\\n .load(\"/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv\")\n\ndiamonds_df.show(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b1b0d245-342b-472b-9c14-9e7883cf73f4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.sql.functions import avg\n\nresults_df = diamonds_df.select(\"color\", \"price\") \\\n .groupBy(\"color\") \\\n .agg(avg(\"price\")) \\\n .sort(\"color\")\n\nresults_df.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"015c9e24-1834-4494-a61e-f842bf41371d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n 
}\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(results_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c6d78c3a-7eb6-4376-af28-4bde47c83b4c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"32c660e0-a73f-43d4-9695-affcc19a0a2d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"01-getting-started","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":2879982568079096}},"nbformat":4,"nbformat_minor":0}

‎02-spark-dataframe-demo.ipynb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"cells":[{"cell_type":"code","source":["raw_fire_df = spark.read \\\n .format(\"csv\") \\\n .option(\"header\", \"true\") \\\n .option(\"inferSchema\", \"true\") \\\n .load(\"/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1493b35d-a05e-4322-949c-2c6a7db9e146"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["raw_fire_df.show(10)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"33c9f1f1-299b-45d9-b7cf-7940ac9e1d80"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n 
}\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(raw_fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b333860d-334b-42a4-b073-a98bc58b1c43"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["raw_fire_df.createGlobalTempView(\"fire_service_calls_view\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"56d2f0d6-90a7-4399-8f09-9dead0bbf526"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["%sql\nselect * from 
global_temp.fire_service_calls_view"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"75c57597-0f81-40a7-885f-64829f3db180"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"19f40c42-622a-43a2-bd56-d182e528fe6b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"02-spark-dataframe-demo","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":2787702214819532}},"nbformat":4,"nbformat_minor":0}

‎03-spark-table-demo.sql

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
-- Databricks notebook source
2+
drop table if exists demo_db.fire_service_calls_tbl;
3+
-- demo_db is a database (schema), not a view: drop it, cascading to any tables still inside, so the notebook starts from a clean state.
drop database if exists demo_db cascade;
4+
5+
-- COMMAND ----------
6+
7+
-- MAGIC %fs rm -r /user/hive/warehouse/demo_db.db
8+
9+
-- COMMAND ----------
10+
11+
create database if not exists demo_db
12+
13+
-- COMMAND ----------
14+
15+
create table if not exists demo_db.fire_service_calls_tbl(
16+
CallNumber integer,
17+
UnitID string,
18+
IncidentNumber integer,
19+
CallType string,
20+
CallDate string,
21+
WatchDate string,
22+
CallFinalDisposition string,
23+
AvailableDtTm string,
24+
Address string,
25+
City string,
26+
Zipcode integer,
27+
Battalion string,
28+
StationArea string,
29+
Box string,
30+
OriginalPriority string,
31+
Priority string,
32+
FinalPriority integer,
33+
ALSUnit boolean,
34+
CallTypeGroup string,
35+
NumAlarms integer,
36+
UnitType string,
37+
UnitSequenceInCallDispatch integer,
38+
FirePreventionDistrict string,
39+
SupervisorDistrict string,
40+
Neighborhood string,
41+
Location string,
42+
RowID string,
43+
Delay float
44+
) using parquet
45+
46+
-- COMMAND ----------
47+
48+
insert into demo_db.fire_service_calls_tbl
49+
values(1234, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null,
50+
null, null, null, null, null, null, null, null, null)
51+
52+
-- COMMAND ----------
53+
54+
select * from demo_db.fire_service_calls_tbl
55+
56+
-- COMMAND ----------
57+
58+
truncate table demo_db.fire_service_calls_tbl
59+
60+
-- COMMAND ----------
61+
62+
insert into demo_db.fire_service_calls_tbl
63+
select * from global_temp.fire_service_calls_view
64+
65+
-- COMMAND ----------
66+
67+
select * from demo_db.fire_service_calls_tbl
68+
69+
-- COMMAND ----------
70+
71+

‎04-spark-sql-demo.sql

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
-- Databricks notebook source
2+
select * from demo_db.fire_service_calls_tbl limit 100
3+
4+
-- COMMAND ----------
5+
6+
drop view if exists fire_service_calls_tbl_cache;
7+
8+
-- COMMAND ----------
9+
10+
cache lazy table fire_service_calls_tbl_cache as
11+
select * from demo_db.fire_service_calls_tbl
12+
13+
-- COMMAND ----------
14+
15+
select count(*) from demo_db.fire_service_calls_tbl
16+
17+
-- COMMAND ----------
18+
19+
-- MAGIC %md
20+
-- MAGIC ##### Q1. How many distinct types of calls were made to the Fire Department?
21+
22+
-- COMMAND ----------
23+
24+
select count(distinct callType) as distinct_call_type_count
25+
from demo_db.fire_service_calls_tbl
26+
where callType is not null
27+
28+
-- COMMAND ----------
29+
30+
-- MAGIC %md
31+
-- MAGIC ##### Q2. What were distinct types of calls made to the Fire Department?
32+
33+
-- COMMAND ----------
34+
35+
select distinct callType as distinct_call_types
36+
from demo_db.fire_service_calls_tbl
37+
where callType is not null
38+
39+
-- COMMAND ----------
40+
41+
-- MAGIC %md
42+
-- MAGIC ##### Q3. Find all responses with delay times greater than 5 minutes.
43+
44+
-- COMMAND ----------
45+
46+
select callNumber, Delay
47+
from demo_db.fire_service_calls_tbl
48+
where Delay > 5
49+
50+
-- COMMAND ----------
51+
52+
-- MAGIC %md
53+
-- MAGIC ##### Q4. What were the most common call types?
54+
55+
-- COMMAND ----------
56+
57+
select callType, count(*) as count
58+
from demo_db.fire_service_calls_tbl
59+
where callType is not null
60+
group by callType
61+
order by count desc
62+
63+
-- COMMAND ----------
64+
65+
-- MAGIC %md
66+
-- MAGIC ##### Q5. What zip codes accounted for the most common calls?
67+
68+
-- COMMAND ----------
69+
70+
select callType, zipCode, count(*) as count
71+
from demo_db.fire_service_calls_tbl
72+
where callType is not null
73+
group by callType, zipCode
74+
order by count desc
75+
76+
-- COMMAND ----------
77+
78+
-- MAGIC %md
79+
-- MAGIC ##### Q6. What San Francisco neighborhoods are in the zip codes 94102 and 94103?
80+
81+
-- COMMAND ----------
82+
83+
select zipCode, neighborhood
84+
from demo_db.fire_service_calls_tbl
85+
where zipCode == 94102 or zipCode == 94103
86+
87+
-- COMMAND ----------
88+
89+
-- MAGIC %md
90+
-- MAGIC ##### Q7. What was the sum of all call alarms, and the average, min, and max of the call response times?
91+
92+
-- COMMAND ----------
93+
94+
select sum(NumAlarms), avg(Delay), min(Delay), max(Delay)
95+
from demo_db.fire_service_calls_tbl
96+
97+
-- COMMAND ----------
98+
99+
-- MAGIC %md
100+
-- MAGIC ##### Q8. How many distinct years of data are in the data set?
101+
102+
-- COMMAND ----------
103+
104+
select distinct year(to_date(callDate, "MM/dd/yyyy")) as year_num
105+
from demo_db.fire_service_calls_tbl
106+
order by year_num
107+
108+
-- COMMAND ----------
109+
110+
-- MAGIC %md
111+
-- MAGIC ##### Q9. What week of the year in 2018 had the most fire calls?
112+
113+
-- COMMAND ----------
114+
115+
select weekofyear(to_date(callDate, "MM/dd/yyyy")) week_year, count(*) as count
116+
from demo_db.fire_service_calls_tbl
117+
where year(to_date(callDate, "MM/dd/yyyy")) == 2018
118+
group by week_year
119+
order by count desc
120+
121+
-- COMMAND ----------
122+
123+
-- MAGIC %md
124+
-- MAGIC ##### Q10. What neighborhoods in San Francisco had the worst response time in 2018?
125+
126+
-- COMMAND ----------
127+
128+
select neighborhood, delay
129+
from demo_db.fire_service_calls_tbl
130+
where year(to_date(callDate, "MM/dd/yyyy")) == 2018
131+
order by delay desc
132+
133+
-- COMMAND ----------
134+
135+

‎05-working-with-dataframe.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

‎HelloSpark.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pyspark.sql import *
2+
3+
if __name__ == "__main__":
4+
5+
spark = SparkSession.builder \
6+
.appName("Hello Spark") \
7+
.master("local[2]") \
8+
.getOrCreate()
9+
10+
data_list = [("Ravi", 28),
11+
("David", 45),
12+
("Abdul", 27)]
13+
14+
15+
16+
df = spark.createDataFrame(data_list).toDF("Name", "Age")
17+
df.show()

0 commit comments

Comments
 (0)
Please sign in to comment.