Commit 820ff58

week 2
1 parent c36a21f commit 820ff58

6 files changed: +2845 −0 lines changed

Week_2/S2E4_Regresion_Lineal.ipynb (+500)

Large diffs are not rendered by default.

Week_2/S2E5_Creacion_Variables.ipynb (+355)

@@ -0,0 +1,355 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Creating New Variables - Feature Extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# sc and sqlContext are assumed to be provided by the environment\n",
    "# (e.g. a PySpark shell); uncomment to create them manually:\n",
    "#from pyspark import SparkContext\n",
    "#sc = SparkContext()\n",
    "#from pyspark.sql import SQLContext\n",
    "#sqlContext=SQLContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "bd5 = sqlContext.read.format(\n",
    "    \"com.databricks.spark.csv\"\n",
    ").option(\"header\", \"true\").load(\"bd5.csv\", inferSchema=True)\n",
    "sqlContext.registerDataFrameAsTable(bd5, \"bd5\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dummy Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "bd5 = bd5.withColumn('Horario1',(bd5.Horario==1) \n",
    ").withColumn('Horario2',(bd5.Horario==2) \n",
    ").withColumn('Horario3',(bd5.Horario==3))"
   ]
  },
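  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that the comparisons above yield Boolean columns (`Horario1=False`, etc., as seen in the output further below). If numeric 0/1 dummies are preferred, a minimal sketch of the same cell is to cast the result (the name `bd5num` is made up for illustration):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Sketch: cast the Boolean dummies to 0/1 integers\n",
    "bd5num = bd5.withColumn('Horario1', (bd5.Horario == 1).cast('integer')\n",
    ").withColumn('Horario2', (bd5.Horario == 2).cast('integer')\n",
    ").withColumn('Horario3', (bd5.Horario == 3).cast('integer'))"
   ]
  },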
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binary Discretized Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(YEAR=2016, MONTH=12, DAY_OF_MONTH=1, DAY_OF_WEEK=4, CRS_DEP_TIME=1440, OP_UNIQUE_CARRIER='AA', TAIL_NUM='N011AA', ARR_DELAY=-19.0, DEP_DELAY=-8.0, ORIGIN='LAS', DEST='LAX', DISTANCE=236.0, CANCELLED=0.0, DIVERTED=0.0, CARRIER_DELAY=0.0, WEATHER_DELAY=0.0, NAS_DELAY=0.0, SECURITY_DELAY=0.0, LATE_AIRCRAFT_DELAY=0.0, LogD=2.3729120029701067, Retraso=0, RetrasoNeto=-11.0, Horario=3, Horario1=False, Horario2=False, Horario3=True, SalidaBin=0.0)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.ml.feature import Binarizer\n",
    "\n",
    "binarizer = Binarizer(threshold=15.0, inputCol='DEP_DELAY', outputCol='SalidaBin')\n",
    "binarizer.transform(bd5).head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------+---------+\n",
      "|DEP_DELAY|SalidaBin|\n",
      "+---------+---------+\n",
      "|     -8.0|      0.0|\n",
      "|      6.0|      0.0|\n",
      "|     -5.0|      0.0|\n",
      "|     -6.0|      0.0|\n",
      "|     -5.0|      0.0|\n",
      "|     -5.0|      0.0|\n",
      "|     -8.0|      0.0|\n",
      "|     -6.0|      0.0|\n",
      "|     -3.0|      0.0|\n",
      "|     -6.0|      0.0|\n",
      "|    -11.0|      0.0|\n",
      "|      0.0|      0.0|\n",
      "|      1.0|      0.0|\n",
      "|      2.0|      0.0|\n",
      "|     -9.0|      0.0|\n",
      "|      5.0|      0.0|\n",
      "|      7.0|      0.0|\n",
      "|     -4.0|      0.0|\n",
      "|     -2.0|      0.0|\n",
      "|     -7.0|      0.0|\n",
      "+---------+---------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "binarizer.transform(bd5).select('DEP_DELAY','SalidaBin').show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variables Discretized into Buckets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------+---------+\n",
      "|DEP_DELAY|SalidaCat|\n",
      "+---------+---------+\n",
      "|     -8.0|      0.0|\n",
      "|      6.0|      1.0|\n",
      "|     -5.0|      0.0|\n",
      "|     -6.0|      0.0|\n",
      "|     -5.0|      0.0|\n",
      "|     -5.0|      0.0|\n",
      "|     -8.0|      0.0|\n",
      "|     -6.0|      0.0|\n",
      "|     -3.0|      0.0|\n",
      "|     -6.0|      0.0|\n",
      "|    -11.0|      0.0|\n",
      "|      0.0|      1.0|\n",
      "|      1.0|      1.0|\n",
      "|      2.0|      1.0|\n",
      "|     -9.0|      0.0|\n",
      "|      5.0|      1.0|\n",
      "|      7.0|      1.0|\n",
      "|     -4.0|      0.0|\n",
      "|     -2.0|      0.0|\n",
      "|     -7.0|      0.0|\n",
      "+---------+---------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.feature import Bucketizer\n",
    "bucketizer = Bucketizer(splits=[-float(\"inf\"), 0.0, 15.0, float(\"inf\")],\n",
    "                        inputCol='DEP_DELAY', outputCol='SalidaCat')\n",
    "bucketizer.transform(bd5).select('DEP_DELAY','SalidaCat').show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Newer versions of PySpark include other transformations, for example QuantileDiscretizer; a minimal sketch follows below."
   ]
  },
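  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A sketch of that idea, assuming Spark 2.0+ where `QuantileDiscretizer` is available (the bucket count and the output column name `SalidaQ` are made up for illustration):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import QuantileDiscretizer\n",
    "\n",
    "# QuantileDiscretizer is an Estimator: fit() learns the quantile\n",
    "# boundaries, and the fitted model applies them like a Bucketizer.\n",
    "qd = QuantileDiscretizer(numBuckets=4, inputCol='DEP_DELAY', outputCol='SalidaQ')\n",
    "qd.fit(bd5).transform(bd5).select('DEP_DELAY', 'SalidaQ').show(5)"
   ]
  },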
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Polynomial Expansion of Variables\n",
    "(quadratic terms, products, etc.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(DEP_DELAY=-8.0, DISTANCE=236.0, Polyn=DenseVector([-8.0, 64.0, 236.0, -1888.0, 55696.0])),\n",
       " Row(DEP_DELAY=6.0, DISTANCE=236.0, Polyn=DenseVector([6.0, 36.0, 236.0, 1416.0, 55696.0])),\n",
       " Row(DEP_DELAY=-5.0, DISTANCE=236.0, Polyn=DenseVector([-5.0, 25.0, 236.0, -1180.0, 55696.0])),\n",
       " Row(DEP_DELAY=-6.0, DISTANCE=236.0, Polyn=DenseVector([-6.0, 36.0, 236.0, -1416.0, 55696.0])),\n",
       " Row(DEP_DELAY=-5.0, DISTANCE=651.0, Polyn=DenseVector([-5.0, 25.0, 651.0, -3255.0, 423801.0]))]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.ml.feature import VectorAssembler\n",
    "from pyspark.ml.feature import PolynomialExpansion\n",
    "\n",
    "assembler = VectorAssembler(\n",
    "    inputCols=['DEP_DELAY','DISTANCE'],\n",
    "    outputCol='features')\n",
    "\n",
    "px = PolynomialExpansion(\n",
    "    degree=2, \n",
    "    inputCol=\"features\", \n",
    "    outputCol=\"Polyn\")\n",
    "\n",
    "bd6 = px.transform(assembler.transform(bd5))\n",
    "\n",
    "bd6.select('DEP_DELAY','DISTANCE','Polyn').head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Standardization of Variables"
   ]
  },
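  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With `withMean=True` and `withStd=True`, `StandardScaler` centers and scales each feature: $z = (x - \\mu) / \\sigma$, where $\\mu$ and $\\sigma$ are the per-column mean and standard deviation estimated by `fit()`."
   ]
  },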
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------+--------------------+\n",
      "|     features|         stdfeatures|\n",
      "+-------------+--------------------+\n",
      "| [-8.0,236.0]|[-0.5061531206197...|\n",
      "|  [6.0,236.0]|[-0.2251841350618...|\n",
      "| [-5.0,236.0]|[-0.4459454808573...|\n",
      "| [-6.0,236.0]|[-0.4660146941114...|\n",
      "| [-5.0,651.0]|[-0.4459454808573...|\n",
      "| [-5.0,370.0]|[-0.4459454808573...|\n",
      "| [-8.0,868.0]|[-0.5061531206197...|\n",
      "|[-6.0,1464.0]|[-0.4660146941114...|\n",
      "|[-3.0,1464.0]|[-0.4058070543490...|\n",
      "|[-6.0,1055.0]|[-0.4660146941114...|\n",
      "|[-11.0,255.0]|[-0.5663607603821...|\n",
      "| [0.0,1440.0]|[-0.3455994145866...|\n",
      "|  [1.0,641.0]|[-0.3255302013325...|\n",
      "| [2.0,1440.0]|[-0.3054609880783...|\n",
      "|[-9.0,1055.0]|[-0.5262223338738...|\n",
      "| [5.0,1055.0]|[-0.2452533483159...|\n",
      "|  [7.0,370.0]|[-0.2051149218077...|\n",
      "|[-4.0,1055.0]|[-0.4258762676032...|\n",
      "|[-2.0,1055.0]|[-0.3857378410949...|\n",
      "|[-7.0,1464.0]|[-0.4860839073656...|\n",
      "+-------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.feature import StandardScaler\n",
    "\n",
    "scaler = StandardScaler(inputCol=\"features\", outputCol=\"stdfeatures\",\n",
    "                        withStd=True, withMean=True)\n",
    "scalerModel = scaler.fit(bd6)\n",
    "bd6std = scalerModel.transform(bd6)\n",
    "\n",
    "bd6std.select('features','stdfeatures').show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Manual Transformation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "bd7 = bd6.withColumn('DepDelay2',(bd6.DEP_DELAY**2)\n",
    ").withColumn('DepD_Distance',(bd6.DEP_DELAY * bd6.DISTANCE))"
   ]
  },
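  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same kind of derived column can also be written with `pyspark.sql.functions`. A minimal sketch, assuming the `LogD` column seen in the data above is the base-10 log of `DISTANCE` (the name `bd7b` is made up for illustration):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import functions as F\n",
    "\n",
    "# Equivalent derived columns using built-in column functions\n",
    "bd7b = bd6.withColumn('DepDelay2', F.pow(F.col('DEP_DELAY'), 2)\n",
    ").withColumn('LogD', F.log10(F.col('DISTANCE')))"
   ]
  },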
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
