Skip to content

Commit 72ba3c1

Browse files
Initial Commit
1 parent deeb11e commit 72ba3c1

File tree

5 files changed

+101
-0
lines changed

5 files changed

+101
-0
lines changed

02-HelloRDD/HelloRDD.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import sys
2+
3+
from pyspark import SparkConf, SparkContext
4+
from collections import namedtuple
5+
6+
from pyspark.sql import SparkSession
7+
8+
from lib.logger import Log4j
9+
10+
# Record type for the four survey columns this job reads from each input line.
SurveyRecord = namedtuple("SurveyRecord", ["Age", "Gender", "Country", "State"])


if __name__ == "__main__":
    # Script entry point: count survey respondents younger than 40 per country
    # and log one (country, count) pair per line through the log4j wrapper.
    #
    # Usage: HelloRDD <filename>
    #   <filename> is the path of the survey CSV handed to sc.textFile().

    # Fail fast with a readable message, before any Spark start-up cost,
    # instead of an IndexError on sys.argv[1] further down.
    if len(sys.argv) < 2:
        sys.exit("Usage: HelloRDD <filename>")

    conf = SparkConf() \
        .setMaster("local[3]") \
        .setAppName("HelloRDD")

    # The SparkContext is taken from the session rather than constructed
    # directly, so the same session object can also be given to Log4j.
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    try:
        sc = spark.sparkContext
        logger = Log4j(spark)

        linesRDD = sc.textFile(sys.argv[1])
        partitionedRDD = linesRDD.repartition(2)

        # Drop every double quote, then split on commas.
        # NOTE: this is not quote-aware CSV parsing; a quoted field that
        # contains a comma would shift the columns that follow it.
        colsRDD = partitionedRDD.map(lambda line: line.replace('"', '').split(","))
        # Column 0 is the timestamp in data/sample.csv; columns 1-4 supply
        # Age, Gender, Country and State. int() raises on a non-numeric age.
        selectRDD = colsRDD.map(lambda cols: SurveyRecord(int(cols[1]), cols[2], cols[3], cols[4]))
        filteredRDD = selectRDD.filter(lambda r: r.Age < 40)
        kvRDD = filteredRDD.map(lambda r: (r.Country, 1))
        countRDD = kvRDD.reduceByKey(lambda v1, v2: v1 + v2)

        # collect() brings every (country, count) pair back to the driver.
        colsList = countRDD.collect()
        for x in colsList:
            logger.info(x)
    finally:
        # Always release the session, even when the job fails part-way.
        spark.stop()

02-HelloRDD/data/sample.csv

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2014-08-27 11:29:31,37,"Female","United States","IL",NA,"No","Yes","Often","6-25","No","Yes","Yes","Not sure","No","Yes","Yes","Somewhat easy","No","No","Some of them","Yes","No","Maybe","Yes","No",NA
2+
2014-08-27 11:29:37,44,"M","United States","IN",NA,"No","No","Rarely","More than 1000","No","No","Don't know","No","Don't know","Don't know","Don't know","Don't know","Maybe","No","No","No","No","No","Don't know","No",NA
3+
2014-08-27 11:29:44,32,"Male","Canada",NA,NA,"No","No","Rarely","6-25","No","Yes","No","No","No","No","Don't know","Somewhat difficult","No","No","Yes","Yes","Yes","Yes","No","No",NA
4+
2014-08-27 11:29:46,31,"Male","United Kingdom",NA,NA,"Yes","Yes","Often","26-100","No","Yes","No","Yes","No","No","No","Somewhat difficult","Yes","Yes","Some of them","No","Maybe","Maybe","No","Yes",NA
5+
2014-08-27 11:30:22,31,"Male","United States","TX",NA,"No","No","Never","100-500","Yes","Yes","Yes","No","Don't know","Don't know","Don't know","Don't know","No","No","Some of them","Yes","Yes","Yes","Don't know","No",NA
6+
2014-08-27 11:31:22,33,"Male","United States","TN",NA,"Yes","No","Sometimes","6-25","No","Yes","Yes","Not sure","No","Don't know","Don't know","Don't know","No","No","Yes","Yes","No","Maybe","Don't know","No",NA
7+
2014-08-27 11:31:50,35,"Female","United States","MI",NA,"Yes","Yes","Sometimes","1-5","Yes","Yes","No","No","No","No","No","Somewhat difficult","Maybe","Maybe","Some of them","No","No","No","Don't know","No",NA
8+
2014-08-27 11:32:05,39,"M","Canada",NA,NA,"No","No","Never","1-5","Yes","Yes","No","Yes","No","No","Yes","Don't know","No","No","No","No","No","No","No","No",NA
9+
2014-08-27 11:32:39,42,"Female","United States","IL",NA,"Yes","Yes","Sometimes","100-500","No","Yes","Yes","Yes","No","No","No","Very difficult","Maybe","No","Yes","Yes","No","Maybe","No","No",NA

02-HelloRDD/lib/__init__.py

Whitespace-only changes.

02-HelloRDD/lib/logger.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
class Log4j:
    """Python-side wrapper that forwards messages to a JVM log4j logger."""

    def __init__(self, spark):
        """Obtain the log4j logger for the running application.

        The logger name is a fixed prefix joined by a dot with the value of
        the ``spark.app.name`` entry read from the session's SparkContext conf.

        :param spark: session object exposing ``_jvm`` and ``sparkContext``.
        """
        jvm_log4j = spark._jvm.org.apache.log4j

        name_prefix = "guru.learningjournal.spark.examples"
        application = spark.sparkContext.getConf().get("spark.app.name")

        logger_name = ".".join((name_prefix, application))
        self.logger = jvm_log4j.LogManager.getLogger(logger_name)

    def warn(self, message):
        """Forward ``message`` to the wrapped logger's ``warn``."""
        self.logger.warn(message)

    def info(self, message):
        """Forward ``message`` to the wrapped logger's ``info``."""
        self.logger.info(message)

    def error(self, message):
        """Forward ``message`` to the wrapped logger's ``error``."""
        self.logger.error(message)

    def debug(self, message):
        """Forward ``message`` to the wrapped logger's ``debug``."""
        self.logger.debug(message)

02-HelloRDD/log4j.properties

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Set everything to be logged to the console
2+
log4j.rootCategory=WARN, console
3+
4+
# define console appender
5+
log4j.appender.console=org.apache.log4j.ConsoleAppender
6+
log4j.appender.console.target=System.out
7+
log4j.appender.console.layout=org.apache.log4j.PatternLayout
8+
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
9+
10+
#application log
11+
log4j.logger.guru.learningjournal.spark.examples=INFO, console, file
12+
log4j.additivity.guru.learningjournal.spark.examples=false
13+
14+
#define rolling file appender
15+
log4j.appender.file=org.apache.log4j.RollingFileAppender
16+
log4j.appender.file.File=${spark.yarn.app.container.log.dir}/${logfile.name}.log
17+
# Define the following as Java system properties (JVM -D options):
18+
# -Dlog4j.configuration=file:log4j.properties
19+
# -Dlogfile.name=hello-spark
20+
# -Dspark.yarn.app.container.log.dir=app-logs
21+
log4j.appender.file.ImmediateFlush=true
22+
log4j.appender.file.Append=false
23+
log4j.appender.file.MaxFileSize=500MB
24+
log4j.appender.file.MaxBackupIndex=2
25+
log4j.appender.file.layout=org.apache.log4j.PatternLayout
26+
log4j.appender.file.layout.conversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
27+
28+
# Recommendations from Spark template
29+
log4j.logger.org.apache.spark.repl.Main=WARN
30+
log4j.logger.org.spark_project.jetty=WARN
31+
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
32+
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
33+
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
34+
log4j.logger.org.apache.parquet=ERROR
35+
log4j.logger.parquet=ERROR
36+
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
37+
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
38+

0 commit comments

Comments
 (0)