Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit e0fd048

Browse files
committedJul 3, 2024
fix: Add missing solutions to questions 5-10 for the 24. More Dataframe Transformations Lesson
Add missing solutions to questions 5-10 for the "24. More Dataframe Transformations" lesson. This corresponds to the 05-working-with-dataframe.ipynb notebook. I also added `order by Delay desc` to the Q10 SQL, since that matches the video and the given SQL did not match the question being asked. Signed-off-by: mvaal <mvaal@expediagroup.com>
1 parent 33d16ad commit e0fd048

File tree

1 file changed

+779
-1
lines changed

1 file changed

+779
-1
lines changed
 

‎05-working-with-dataframe.ipynb

Lines changed: 779 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,779 @@
1-
{"cells":[{"cell_type":"code","source":["from pyspark.sql.functions import *"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"714857fa-35e9-45e3-a6d3-1d16a0cc7f91"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["raw_fire_df = spark.read \\\n .format(\"csv\") \\\n .option(\"header\", \"true\") \\\n .option(\"inferSchema\",\"true\") \\\n .load(\"/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6fac9a4c-8d5f-4eda-aca1-180eb49086c1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n 
}\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(raw_fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"28a6868e-f4be-4406-ae8d-f67613d9f493"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["renamed_fire_df = raw_fire_df \\\n .withColumnRenamed(\"Call Number\", \"CallNumber\") \\\n .withColumnRenamed(\"Unit ID\", \"UnitID\") \\\n .withColumnRenamed(\"Incident Number\", \"IncidentNumber\") \\\n .withColumnRenamed(\"Call Date\", \"CallDate\") \\\n .withColumnRenamed(\"Watch Date\", \"WatchDate\") \\\n .withColumnRenamed(\"Call Final Disposition\", \"CallFinalDisposition\") \\\n .withColumnRenamed(\"Available DtTm\", \"AvailableDtTm\") \\\n .withColumnRenamed(\"Zipcode of Incident\", \"Zipcode\") \\\n .withColumnRenamed(\"Station Area\", \"StationArea\") \\\n .withColumnRenamed(\"Final Priority\", \"FinalPriority\") \\\n .withColumnRenamed(\"ALS Unit\", \"ALSUnit\") \\\n .withColumnRenamed(\"Call Type Group\", \"CallTypeGroup\") \\\n .withColumnRenamed(\"Unit sequence in call dispatch\", \"UnitSequenceInCallDispatch\") \\\n .withColumnRenamed(\"Fire Prevention District\", \"FirePreventionDistrict\") \\\n .withColumnRenamed(\"Supervisor District\", 
\"SupervisorDistrict\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3a45840e-00d7-403c-bcb5-f4c2b0e1dbba"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(renamed_fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"34acbca6-5403-4389-812b-47d79aae4d6d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["renamed_fire_df.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"12343768-27f0-4b8c-bd7d-e43ea856ea6a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: 
block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["fire_df = renamed_fire_df \\\n .withColumn(\"CallDate\", to_date(\"CallDate\", \"MM/dd/yyyy\")) \\\n .withColumn(\"WatchDate\", to_date(\"WatchDate\", \"MM/dd/yyyy\")) \\\n .withColumn(\"AvailableDtTm\", to_timestamp(\"AvailableDtTm\", \"MM/dd/yyyy hh:mm:ss a\")) \\\n .withColumn(\"Delay\", round(\"Delay\", 2))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1bfbd70e-ee18-49d6-affd-bc4033116772"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(fire_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b2605e0c-3bdb-4e27-ad86-939c80bd3a9d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n 
color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["fire_df.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4ed3cca5-aac7-4529-bbec-8c8c85e11b5d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["fire_df.cache()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2cf3909b-63a5-4140-a744-b3ff9213544e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q1. 
How many distinct types of calls were made to the Fire Department?\n```SQL\nselect count(distinct CallType) as distinct_call_type_count\nfrom fire_service_calls_tbl\nwhere CallType is not null\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7596246e-e79c-4e85-81eb-d05047216212"}}},{"cell_type":"code","source":["fire_df.createOrReplaceTempView(\"fire_service_calls_view\")\nq1_sql_df = spark.sql(\"\"\"\n select count(distinct CallType) as distinct_call_type_count\n from fire_service_calls_view\n where CallType is not null\n \"\"\")\ndisplay(q1_sql_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"92360fa0-13d4-4834-bfc6-bf462125e615"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["q1_df = fire_df.where(\"CallType is not null\") \\\n .select(\"CallType\") \\\n .distinct()\nprint(q1_df.count())"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6efce8ff-052a-4bdf-9c1c-0602a4837653"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: 
break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["q1_df1 = fire_df.where(\"CallType is not null\")\nq1_df2 = q1_df1.select(\"CallType\")\nq1_df3 = q1_df2.distinct()\nprint(q1_df3.count())"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"cbdf0db8-e0a3-41b7-b672-b134b6c811ba"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q2. 
What were distinct types of calls made to the Fire Department?\n```sql\nselect distinct CallType as distinct_call_types\nfrom fire_service_calls_tbl\nwhere CallType is not null\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7d4f11c9-374c-41df-b4bb-9a0d812a7975"}}},{"cell_type":"code","source":["q2_df = fire_df.where(\"CallType is not null\") \\\n .select(expr(\"CallType as distinct_call_type\")) \\\n .distinct()\nq2_df.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7af8c2e5-864b-4196-8315-2ef9f9d82c64"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"code","source":["display(q2_df)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"822d62b3-8c09-4237-92fb-80a855c202c1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n 
}\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q3. Find out all response for delayed times greater than 5 mins?\n``` sql\nselect CallNumber, Delay\nfrom fire_service_calls_tbl\nwhere Delay > 5\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"85b11023-f976-465a-9aed-0d7de4383722"}}},{"cell_type":"code","source":["fire_df.where(\"Delay > 5\") \\\n .select(\"CallNumber\", \"Delay\") \\\n .show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"06543f39-21ea-4bda-a228-03257788c5f9"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q4. 
What were the most common call types?\n```sql\nselect CallType, count(*) as count\nfrom fire_service_calls_tbl\nwhere CallType is not null\ngroup by CallType\norder by count desc\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d682e1e3-28f2-4cb1-87af-3b2a41fa1f9f"}}},{"cell_type":"code","source":["fire_df.select(\"CallType\") \\\n .where(\"CallType is not null\") \\\n .groupBy(\"CallType\") \\\n .count() \\\n .orderBy(\"count\", ascending=False) \\\n .show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b8c3327f-104b-4e9e-b941-9b0a57571d0b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q5. 
What zip codes accounted for most common calls?\n```sql\nselect CallType, ZipCode, count(*) as count\nfrom fire_service_calls_tbl\nwhere CallType is not null\ngroup by CallType, Zipcode\norder by count desc\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"05668ab2-6c49-46f7-bf75-05bcaa7ca666"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c88d0eb0-fd17-424e-bc49-d0a2231d7d06"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q6. 
What San Francisco neighborhoods are in the zip codes 94102 and 94103\n```sql\nselect distinct Neighborhood, Zipcode\nfrom fire_service_calls_tbl\nwhere Zipcode== 94102 or Zipcode == 94103\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"48e6bc0a-bcec-4d20-a165-9079aa74adb2"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"21ff24e5-845d-42ad-bba1-157f7cb1dab0"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q7. 
What was the sum of all calls, average, min and max of the response times for calls?\n```sql\nselect sum(NumAlarms), avg(Delay), min(Delay), max(Delay)\nfrom fire_service_calls_tbl\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bb33f91e-0d67-414b-b056-88fc38b6a6bd"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"aa54b23f-6b28-4b07-80c8-38c319216d11"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q8. 
How many distinct years of data is in the CSV file?\n```sql\nselect distinct year(to_timestamp(CallDate, \"MM/dd/yyyy\")) as year_num\nfrom fire_service_calls_tbl\norder by year_num\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"8894d83e-905c-4b33-85ba-a85c0b41b5cb"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"cc09d106-3298-459a-905a-7efaea20437a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q9. 
What week of the year in 2018 had the most fire calls?\n```sql\nselect weekofyear(to_timestamp(CallDate, \"MM/dd/yyyy\")) week_year, count(*) as count\nfrom fire_service_calls_tbl \nwhere year(to_timestamp(CallDate, \"MM/dd/yyyy\")) == 2018\ngroup by week_year\norder by count desc\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b52c7b9e-6493-4572-85e0-4bd5e8534372"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f8aa82ee-5140-4e78-9d78-35b0c8d40385"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0},{"cell_type":"markdown","source":["##### Q10. 
What neighborhoods in San Francisco had the worst response time in 2018?\n```sql\nselect Neighborhood, Delay\nfrom fire_service_calls_tbl \nwhere year(to_timestamp(CallDate, \"MM/dd/yyyy\")) == 2018\n```"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"63144f63-f0e7-4361-97ca-78173498fc2f"}}},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c03714de-f4af-407f-9227-dae8df634a4a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"errorTraceType":null,"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":["<style scoped>\n .ansiout {\n display: block;\n unicode-bidi: embed;\n white-space: pre-wrap;\n word-wrap: break-word;\n word-break: break-all;\n font-family: \"Source Code Pro\", \"Menlo\", monospace;;\n font-size: 13px;\n color: #555;\n margin-left: 4px;\n line-height: 19px;\n }\n</style>"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"05-working-with-dataframe","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":384359114294295}},"nbformat":4,"nbformat_minor":0}
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 0,
6+
"metadata": {
7+
"application/vnd.databricks.v1+cell": {
8+
"cellMetadata": {
9+
"byteLimit": 2048000,
10+
"rowLimit": 10000
11+
},
12+
"inputWidgets": {},
13+
"nuid": "714857fa-35e9-45e3-a6d3-1d16a0cc7f91",
14+
"showTitle": false,
15+
"title": ""
16+
}
17+
},
18+
"outputs": [],
19+
"source": [
20+
"from pyspark.sql.functions import *"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": 0,
26+
"metadata": {
27+
"application/vnd.databricks.v1+cell": {
28+
"cellMetadata": {
29+
"byteLimit": 2048000,
30+
"rowLimit": 10000
31+
},
32+
"inputWidgets": {},
33+
"nuid": "6fac9a4c-8d5f-4eda-aca1-180eb49086c1",
34+
"showTitle": false,
35+
"title": ""
36+
}
37+
},
38+
"outputs": [],
39+
"source": [
40+
"raw_fire_df = spark.read \\\n",
41+
" .format(\"csv\") \\\n",
42+
" .option(\"header\", \"true\") \\\n",
43+
" .option(\"inferSchema\",\"true\") \\\n",
44+
" .load(\"/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv\")"
45+
]
46+
},
47+
{
48+
"cell_type": "code",
49+
"execution_count": 0,
50+
"metadata": {
51+
"application/vnd.databricks.v1+cell": {
52+
"cellMetadata": {
53+
"byteLimit": 2048000,
54+
"rowLimit": 10000
55+
},
56+
"inputWidgets": {},
57+
"nuid": "28a6868e-f4be-4406-ae8d-f67613d9f493",
58+
"showTitle": false,
59+
"title": ""
60+
}
61+
},
62+
"outputs": [],
63+
"source": [
64+
"display(raw_fire_df)"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": 0,
70+
"metadata": {
71+
"application/vnd.databricks.v1+cell": {
72+
"cellMetadata": {
73+
"byteLimit": 2048000,
74+
"rowLimit": 10000
75+
},
76+
"inputWidgets": {},
77+
"nuid": "3a45840e-00d7-403c-bcb5-f4c2b0e1dbba",
78+
"showTitle": false,
79+
"title": ""
80+
}
81+
},
82+
"outputs": [],
83+
"source": [
84+
"renamed_fire_df = raw_fire_df \\\n",
85+
" .withColumnRenamed(\"Call Number\", \"CallNumber\") \\\n",
86+
" .withColumnRenamed(\"Unit ID\", \"UnitID\") \\\n",
87+
" .withColumnRenamed(\"Incident Number\", \"IncidentNumber\") \\\n",
88+
" .withColumnRenamed(\"Call Date\", \"CallDate\") \\\n",
89+
" .withColumnRenamed(\"Watch Date\", \"WatchDate\") \\\n",
90+
" .withColumnRenamed(\"Call Final Disposition\", \"CallFinalDisposition\") \\\n",
91+
" .withColumnRenamed(\"Available DtTm\", \"AvailableDtTm\") \\\n",
92+
" .withColumnRenamed(\"Zipcode of Incident\", \"Zipcode\") \\\n",
93+
" .withColumnRenamed(\"Station Area\", \"StationArea\") \\\n",
94+
" .withColumnRenamed(\"Final Priority\", \"FinalPriority\") \\\n",
95+
" .withColumnRenamed(\"ALS Unit\", \"ALSUnit\") \\\n",
96+
" .withColumnRenamed(\"Call Type Group\", \"CallTypeGroup\") \\\n",
97+
" .withColumnRenamed(\"Unit sequence in call dispatch\", \"UnitSequenceInCallDispatch\") \\\n",
98+
" .withColumnRenamed(\"Fire Prevention District\", \"FirePreventionDistrict\") \\\n",
99+
" .withColumnRenamed(\"Supervisor District\", \"SupervisorDistrict\")"
100+
]
101+
},
102+
{
103+
"cell_type": "code",
104+
"execution_count": 0,
105+
"metadata": {
106+
"application/vnd.databricks.v1+cell": {
107+
"cellMetadata": {
108+
"byteLimit": 2048000,
109+
"rowLimit": 10000
110+
},
111+
"inputWidgets": {},
112+
"nuid": "34acbca6-5403-4389-812b-47d79aae4d6d",
113+
"showTitle": false,
114+
"title": ""
115+
}
116+
},
117+
"outputs": [],
118+
"source": [
119+
"display(renamed_fire_df)"
120+
]
121+
},
122+
{
123+
"cell_type": "code",
124+
"execution_count": 0,
125+
"metadata": {
126+
"application/vnd.databricks.v1+cell": {
127+
"cellMetadata": {
128+
"byteLimit": 2048000,
129+
"rowLimit": 10000
130+
},
131+
"inputWidgets": {},
132+
"nuid": "12343768-27f0-4b8c-bd7d-e43ea856ea6a",
133+
"showTitle": false,
134+
"title": ""
135+
}
136+
},
137+
"outputs": [],
138+
"source": [
139+
"renamed_fire_df.printSchema()"
140+
]
141+
},
142+
{
143+
"cell_type": "code",
144+
"execution_count": 0,
145+
"metadata": {
146+
"application/vnd.databricks.v1+cell": {
147+
"cellMetadata": {
148+
"byteLimit": 2048000,
149+
"rowLimit": 10000
150+
},
151+
"inputWidgets": {},
152+
"nuid": "1bfbd70e-ee18-49d6-affd-bc4033116772",
153+
"showTitle": false,
154+
"title": ""
155+
}
156+
},
157+
"outputs": [],
158+
"source": [
159+
"fire_df = renamed_fire_df \\\n",
160+
" .withColumn(\"CallDate\", to_date(\"CallDate\", \"MM/dd/yyyy\")) \\\n",
161+
" .withColumn(\"WatchDate\", to_date(\"WatchDate\", \"MM/dd/yyyy\")) \\\n",
162+
" .withColumn(\"AvailableDtTm\", to_timestamp(\"AvailableDtTm\", \"MM/dd/yyyy hh:mm:ss a\")) \\\n",
163+
" .withColumn(\"Delay\", round(\"Delay\", 2))"
164+
]
165+
},
166+
{
167+
"cell_type": "code",
168+
"execution_count": 0,
169+
"metadata": {
170+
"application/vnd.databricks.v1+cell": {
171+
"cellMetadata": {
172+
"byteLimit": 2048000,
173+
"rowLimit": 10000
174+
},
175+
"inputWidgets": {},
176+
"nuid": "b2605e0c-3bdb-4e27-ad86-939c80bd3a9d",
177+
"showTitle": false,
178+
"title": ""
179+
}
180+
},
181+
"outputs": [],
182+
"source": [
183+
"display(fire_df)"
184+
]
185+
},
186+
{
187+
"cell_type": "code",
188+
"execution_count": 0,
189+
"metadata": {
190+
"application/vnd.databricks.v1+cell": {
191+
"cellMetadata": {
192+
"byteLimit": 2048000,
193+
"rowLimit": 10000
194+
},
195+
"inputWidgets": {},
196+
"nuid": "4ed3cca5-aac7-4529-bbec-8c8c85e11b5d",
197+
"showTitle": false,
198+
"title": ""
199+
}
200+
},
201+
"outputs": [],
202+
"source": [
203+
"fire_df.printSchema()"
204+
]
205+
},
206+
{
207+
"cell_type": "code",
208+
"execution_count": 0,
209+
"metadata": {
210+
"application/vnd.databricks.v1+cell": {
211+
"cellMetadata": {
212+
"byteLimit": 2048000,
213+
"rowLimit": 10000
214+
},
215+
"inputWidgets": {},
216+
"nuid": "2cf3909b-63a5-4140-a744-b3ff9213544e",
217+
"showTitle": false,
218+
"title": ""
219+
}
220+
},
221+
"outputs": [],
222+
"source": [
223+
"fire_df.cache()"
224+
]
225+
},
226+
{
227+
"cell_type": "markdown",
228+
"metadata": {
229+
"application/vnd.databricks.v1+cell": {
230+
"cellMetadata": {
231+
"byteLimit": 2048000,
232+
"rowLimit": 10000
233+
},
234+
"inputWidgets": {},
235+
"nuid": "7596246e-e79c-4e85-81eb-d05047216212",
236+
"showTitle": false,
237+
"title": ""
238+
}
239+
},
240+
"source": [
241+
"##### Q1. How many distinct types of calls were made to the Fire Department?\n",
242+
"```SQL\n",
243+
"select count(distinct CallType) as distinct_call_type_count\n",
244+
"from fire_service_calls_tbl\n",
245+
"where CallType is not null\n",
246+
"```"
247+
]
248+
},
249+
{
250+
"cell_type": "code",
251+
"execution_count": 0,
252+
"metadata": {
253+
"application/vnd.databricks.v1+cell": {
254+
"cellMetadata": {
255+
"byteLimit": 2048000,
256+
"rowLimit": 10000
257+
},
258+
"inputWidgets": {},
259+
"nuid": "92360fa0-13d4-4834-bfc6-bf462125e615",
260+
"showTitle": false,
261+
"title": ""
262+
}
263+
},
264+
"outputs": [],
265+
"source": [
266+
"fire_df.createOrReplaceTempView(\"fire_service_calls_view\")\n",
267+
"q1_sql_df = spark.sql(\"\"\"\n",
268+
" select count(distinct CallType) as distinct_call_type_count\n",
269+
" from fire_service_calls_view\n",
270+
" where CallType is not null\n",
271+
" \"\"\")\n",
272+
"display(q1_sql_df)"
273+
]
274+
},
275+
{
276+
"cell_type": "code",
277+
"execution_count": 0,
278+
"metadata": {
279+
"application/vnd.databricks.v1+cell": {
280+
"cellMetadata": {
281+
"byteLimit": 2048000,
282+
"rowLimit": 10000
283+
},
284+
"inputWidgets": {},
285+
"nuid": "6efce8ff-052a-4bdf-9c1c-0602a4837653",
286+
"showTitle": false,
287+
"title": ""
288+
}
289+
},
290+
"outputs": [],
291+
"source": [
292+
"q1_df = fire_df.where(\"CallType is not null\") \\\n",
293+
" .select(\"CallType\") \\\n",
294+
" .distinct()\n",
295+
"print(q1_df.count())"
296+
]
297+
},
298+
{
299+
"cell_type": "code",
300+
"execution_count": 0,
301+
"metadata": {
302+
"application/vnd.databricks.v1+cell": {
303+
"cellMetadata": {
304+
"byteLimit": 2048000,
305+
"rowLimit": 10000
306+
},
307+
"inputWidgets": {},
308+
"nuid": "cbdf0db8-e0a3-41b7-b672-b134b6c811ba",
309+
"showTitle": false,
310+
"title": ""
311+
}
312+
},
313+
"outputs": [],
314+
"source": [
315+
"q1_df1 = fire_df.where(\"CallType is not null\")\n",
316+
"q1_df2 = q1_df1.select(\"CallType\")\n",
317+
"q1_df3 = q1_df2.distinct()\n",
318+
"print(q1_df3.count())"
319+
]
320+
},
321+
{
322+
"cell_type": "markdown",
323+
"metadata": {
324+
"application/vnd.databricks.v1+cell": {
325+
"cellMetadata": {
326+
"byteLimit": 2048000,
327+
"rowLimit": 10000
328+
},
329+
"inputWidgets": {},
330+
"nuid": "7d4f11c9-374c-41df-b4bb-9a0d812a7975",
331+
"showTitle": false,
332+
"title": ""
333+
}
334+
},
335+
"source": [
336+
"##### Q2. What were the distinct types of calls made to the Fire Department?\n",
337+
"```sql\n",
338+
"select distinct CallType as distinct_call_types\n",
339+
"from fire_service_calls_tbl\n",
340+
"where CallType is not null\n",
341+
"```"
342+
]
343+
},
344+
{
345+
"cell_type": "code",
346+
"execution_count": 0,
347+
"metadata": {
348+
"application/vnd.databricks.v1+cell": {
349+
"cellMetadata": {
350+
"byteLimit": 2048000,
351+
"rowLimit": 10000
352+
},
353+
"inputWidgets": {},
354+
"nuid": "7af8c2e5-864b-4196-8315-2ef9f9d82c64",
355+
"showTitle": false,
356+
"title": ""
357+
}
358+
},
359+
"outputs": [],
360+
"source": [
361+
"q2_df = fire_df.where(\"CallType is not null\") \\\n",
362+
" .select(expr(\"CallType as distinct_call_type\")) \\\n",
363+
" .distinct()\n",
364+
"q2_df.show()"
365+
]
366+
},
367+
{
368+
"cell_type": "code",
369+
"execution_count": 0,
370+
"metadata": {
371+
"application/vnd.databricks.v1+cell": {
372+
"cellMetadata": {
373+
"byteLimit": 2048000,
374+
"rowLimit": 10000
375+
},
376+
"inputWidgets": {},
377+
"nuid": "822d62b3-8c09-4237-92fb-80a855c202c1",
378+
"showTitle": false,
379+
"title": ""
380+
}
381+
},
382+
"outputs": [],
383+
"source": [
384+
"display(q2_df)"
385+
]
386+
},
387+
{
388+
"cell_type": "markdown",
389+
"metadata": {
390+
"application/vnd.databricks.v1+cell": {
391+
"cellMetadata": {
392+
"byteLimit": 2048000,
393+
"rowLimit": 10000
394+
},
395+
"inputWidgets": {},
396+
"nuid": "85b11023-f976-465a-9aed-0d7de4383722",
397+
"showTitle": false,
398+
"title": ""
399+
}
400+
},
401+
"source": [
402+
"##### Q3. Find all calls with a response delay greater than 5 minutes.\n",
403+
"``` sql\n",
404+
"select CallNumber, Delay\n",
405+
"from fire_service_calls_tbl\n",
406+
"where Delay > 5\n",
407+
"```"
408+
]
409+
},
410+
{
411+
"cell_type": "code",
412+
"execution_count": 0,
413+
"metadata": {
414+
"application/vnd.databricks.v1+cell": {
415+
"cellMetadata": {
416+
"byteLimit": 2048000,
417+
"rowLimit": 10000
418+
},
419+
"inputWidgets": {},
420+
"nuid": "06543f39-21ea-4bda-a228-03257788c5f9",
421+
"showTitle": false,
422+
"title": ""
423+
}
424+
},
425+
"outputs": [],
426+
"source": [
427+
"fire_df.where(\"Delay > 5\") \\\n",
428+
" .select(\"CallNumber\", \"Delay\") \\\n",
429+
" .show()"
430+
]
431+
},
432+
{
433+
"cell_type": "markdown",
434+
"metadata": {
435+
"application/vnd.databricks.v1+cell": {
436+
"cellMetadata": {
437+
"byteLimit": 2048000,
438+
"rowLimit": 10000
439+
},
440+
"inputWidgets": {},
441+
"nuid": "d682e1e3-28f2-4cb1-87af-3b2a41fa1f9f",
442+
"showTitle": false,
443+
"title": ""
444+
}
445+
},
446+
"source": [
447+
"##### Q4. What were the most common call types?\n",
448+
"```sql\n",
449+
"select CallType, count(*) as count\n",
450+
"from fire_service_calls_tbl\n",
451+
"where CallType is not null\n",
452+
"group by CallType\n",
453+
"order by count desc\n",
454+
"```"
455+
]
456+
},
457+
{
458+
"cell_type": "code",
459+
"execution_count": 0,
460+
"metadata": {
461+
"application/vnd.databricks.v1+cell": {
462+
"cellMetadata": {
463+
"byteLimit": 2048000,
464+
"rowLimit": 10000
465+
},
466+
"inputWidgets": {},
467+
"nuid": "b8c3327f-104b-4e9e-b941-9b0a57571d0b",
468+
"showTitle": false,
469+
"title": ""
470+
}
471+
},
472+
"outputs": [],
473+
"source": [
474+
"fire_df.select(\"CallType\") \\\n",
475+
" .where(\"CallType is not null\") \\\n",
476+
" .groupBy(\"CallType\") \\\n",
477+
" .count() \\\n",
478+
" .orderBy(\"count\", ascending=False) \\\n",
479+
" .show()"
480+
]
481+
},
482+
{
483+
"cell_type": "markdown",
484+
"metadata": {
485+
"application/vnd.databricks.v1+cell": {
486+
"cellMetadata": {
487+
"byteLimit": 2048000,
488+
"rowLimit": 10000
489+
},
490+
"inputWidgets": {},
491+
"nuid": "05668ab2-6c49-46f7-bf75-05bcaa7ca666",
492+
"showTitle": false,
493+
"title": ""
494+
}
495+
},
496+
"source": [
497+
"##### Q5. What zip codes accounted for the most common calls?\n",
498+
"```sql\n",
499+
"select CallType, ZipCode, count(*) as count\n",
500+
"from fire_service_calls_tbl\n",
501+
"where CallType is not null\n",
502+
"group by CallType, Zipcode\n",
503+
"order by count desc\n",
504+
"```"
505+
]
506+
},
507+
{
508+
"cell_type": "code",
509+
"execution_count": 0,
510+
"metadata": {
511+
"application/vnd.databricks.v1+cell": {
512+
"cellMetadata": {
513+
"byteLimit": 2048000,
514+
"rowLimit": 10000
515+
},
516+
"inputWidgets": {},
517+
"nuid": "c88d0eb0-fd17-424e-bc49-d0a2231d7d06",
518+
"showTitle": false,
519+
"title": ""
520+
}
521+
},
522+
"outputs": [],
523+
"source": [
524+
"fire_df.where(\"CallType is not null\") \\\n",
525+
" .select(\"CallType\", \"ZipCode\") \\\n",
526+
" .groupBy(\"CallType\", \"ZipCode\") \\\n",
527+
" .count() \\\n",
528+
" .orderBy(\"count\", ascending=False) \\\n",
529+
" .show()"
530+
]
531+
},
532+
{
533+
"cell_type": "markdown",
534+
"metadata": {
535+
"application/vnd.databricks.v1+cell": {
536+
"cellMetadata": {
537+
"byteLimit": 2048000,
538+
"rowLimit": 10000
539+
},
540+
"inputWidgets": {},
541+
"nuid": "48e6bc0a-bcec-4d20-a165-9079aa74adb2",
542+
"showTitle": false,
543+
"title": ""
544+
}
545+
},
546+
"source": [
547+
"##### Q6. What San Francisco neighborhoods are in the zip codes 94102 and 94103?\n",
548+
"```sql\n",
549+
"select distinct Neighborhood, Zipcode\n",
550+
"from fire_service_calls_tbl\n",
551+
"where Zipcode== 94102 or Zipcode == 94103\n",
552+
"```"
553+
]
554+
},
555+
{
556+
"cell_type": "code",
557+
"execution_count": 0,
558+
"metadata": {
559+
"application/vnd.databricks.v1+cell": {
560+
"cellMetadata": {
561+
"byteLimit": 2048000,
562+
"rowLimit": 10000
563+
},
564+
"inputWidgets": {},
565+
"nuid": "21ff24e5-845d-42ad-bba1-157f7cb1dab0",
566+
"showTitle": false,
567+
"title": ""
568+
}
569+
},
570+
"outputs": [],
571+
"source": [
572+
"fire_df.where(\"ZipCode==94102 or ZipCode==94103\") \\\n",
573+
" .select(\"Neighborhood\", \"Zipcode\") \\\n",
574+
" .distinct() \\\n",
575+
" .show()"
576+
]
577+
},
578+
{
579+
"cell_type": "markdown",
580+
"metadata": {
581+
"application/vnd.databricks.v1+cell": {
582+
"cellMetadata": {
583+
"byteLimit": 2048000,
584+
"rowLimit": 10000
585+
},
586+
"inputWidgets": {},
587+
"nuid": "bb33f91e-0d67-414b-b056-88fc38b6a6bd",
588+
"showTitle": false,
589+
"title": ""
590+
}
591+
},
592+
"source": [
593+
"##### Q7. What was the sum of all calls, average, min and max of the response times for calls?\n",
594+
"```sql\n",
595+
"select sum(NumAlarms), avg(Delay), min(Delay), max(Delay)\n",
596+
"from fire_service_calls_tbl\n",
597+
"```"
598+
]
599+
},
600+
{
601+
"cell_type": "code",
602+
"execution_count": 0,
603+
"metadata": {
604+
"application/vnd.databricks.v1+cell": {
605+
"cellMetadata": {
606+
"byteLimit": 2048000,
607+
"rowLimit": 10000
608+
},
609+
"inputWidgets": {},
610+
"nuid": "aa54b23f-6b28-4b07-80c8-38c319216d11",
611+
"showTitle": false,
612+
"title": ""
613+
}
614+
},
615+
"outputs": [],
616+
"source": [
617+
"fire_df.select(sum(\"NumAlarms\"), avg(\"Delay\"), min(\"Delay\"), max(\"Delay\")) \\\n",
618+
" .show()"
619+
]
620+
},
621+
{
622+
"cell_type": "markdown",
623+
"metadata": {
624+
"application/vnd.databricks.v1+cell": {
625+
"cellMetadata": {
626+
"byteLimit": 2048000,
627+
"rowLimit": 10000
628+
},
629+
"inputWidgets": {},
630+
"nuid": "8894d83e-905c-4b33-85ba-a85c0b41b5cb",
631+
"showTitle": false,
632+
"title": ""
633+
}
634+
},
635+
"source": [
636+
"##### Q8. How many distinct years of data are in the CSV file?\n",
637+
"```sql\n",
638+
"select distinct year(to_timestamp(CallDate, \"MM/dd/yyyy\")) as year_num\n",
639+
"from fire_service_calls_tbl\n",
640+
"order by year_num\n",
641+
"```"
642+
]
643+
},
644+
{
645+
"cell_type": "code",
646+
"execution_count": 0,
647+
"metadata": {
648+
"application/vnd.databricks.v1+cell": {
649+
"cellMetadata": {
650+
"byteLimit": 2048000,
651+
"rowLimit": 10000
652+
},
653+
"inputWidgets": {},
654+
"nuid": "cc09d106-3298-459a-905a-7efaea20437a",
655+
"showTitle": false,
656+
"title": ""
657+
}
658+
},
659+
"outputs": [],
660+
"source": [
661+
"fire_df.select(year(\"CallDate\").alias(\"year_num\")) \\\n",
662+
" .distinct() \\\n",
663+
" .orderBy(\"year_num\") \\\n",
664+
" .show()"
665+
]
666+
},
667+
{
668+
"cell_type": "markdown",
669+
"metadata": {
670+
"application/vnd.databricks.v1+cell": {
671+
"cellMetadata": {
672+
"byteLimit": 2048000,
673+
"rowLimit": 10000
674+
},
675+
"inputWidgets": {},
676+
"nuid": "b52c7b9e-6493-4572-85e0-4bd5e8534372",
677+
"showTitle": false,
678+
"title": ""
679+
}
680+
},
681+
"source": [
682+
"##### Q9. What week of the year in 2018 had the most fire calls?\n",
683+
"```sql\n",
684+
"select weekofyear(to_timestamp(CallDate, \"MM/dd/yyyy\")) week_year, count(*) as count\n",
685+
"from fire_service_calls_tbl \n",
686+
"where year(to_timestamp(CallDate, \"MM/dd/yyyy\")) == 2018\n",
687+
"group by week_year\n",
688+
"order by count desc\n",
689+
"```"
690+
]
691+
},
692+
{
693+
"cell_type": "code",
694+
"execution_count": 0,
695+
"metadata": {
696+
"application/vnd.databricks.v1+cell": {
697+
"cellMetadata": {
698+
"byteLimit": 2048000,
699+
"rowLimit": 10000
700+
},
701+
"inputWidgets": {},
702+
"nuid": "f8aa82ee-5140-4e78-9d78-35b0c8d40385",
703+
"showTitle": false,
704+
"title": ""
705+
}
706+
},
707+
"outputs": [],
708+
"source": [
709+
"fire_df.select(weekofyear(\"CallDate\").alias(\"week_year\")) \\\n",
710+
" .where(year(\"CallDate\") == 2018) \\\n",
711+
" .groupBy(\"week_year\") \\\n",
712+
" .count() \\\n",
713+
" .orderBy(\"count\", ascending=False) \\\n",
714+
" .show()"
715+
]
716+
},
717+
{
718+
"cell_type": "markdown",
719+
"metadata": {
720+
"application/vnd.databricks.v1+cell": {
721+
"cellMetadata": {
722+
"byteLimit": 2048000,
723+
"rowLimit": 10000
724+
},
725+
"inputWidgets": {},
726+
"nuid": "63144f63-f0e7-4361-97ca-78173498fc2f",
727+
"showTitle": false,
728+
"title": ""
729+
}
730+
},
731+
"source": [
732+
"##### Q10. What neighborhoods in San Francisco had the worst response time in 2018?\n",
733+
"```sql\n",
734+
"select Neighborhood, Delay\n",
735+
"from fire_service_calls_tbl \n",
736+
"where year(to_timestamp(CallDate, \"MM/dd/yyyy\")) == 2018\n",
737+
"order by Delay desc\n",
738+
"```"
739+
]
740+
},
741+
{
742+
"cell_type": "code",
743+
"execution_count": 0,
744+
"metadata": {
745+
"application/vnd.databricks.v1+cell": {
746+
"cellMetadata": {
747+
"byteLimit": 2048000,
748+
"rowLimit": 10000
749+
},
750+
"inputWidgets": {},
751+
"nuid": "c03714de-f4af-407f-9227-dae8df634a4a",
752+
"showTitle": false,
753+
"title": ""
754+
}
755+
},
756+
"outputs": [],
757+
"source": [
758+
"fire_df.select(\"Neighborhood\", \"Delay\") \\\n",
759+
" .where(year(\"CallDate\") == 2018) \\\n",
760+
" .orderBy(\"Delay\", ascending=False) \\\n",
761+
" .show()"
762+
]
763+
}
764+
],
765+
"metadata": {
766+
"application/vnd.databricks.v1+notebook": {
767+
"dashboards": [],
768+
"environmentMetadata": null,
769+
"language": "python",
770+
"notebookMetadata": {
771+
"pythonIndentUnit": 4
772+
},
773+
"notebookName": "05-working-with-dataframe (1)",
774+
"widgets": {}
775+
}
776+
},
777+
"nbformat": 4,
778+
"nbformat_minor": 0
779+
}

0 commit comments

Comments
 (0)
Please sign in to comment.