Skip to content

Commit f87db8b

Browse files
authored
Importing the project
0 parents  commit f87db8b

File tree

4 files changed

+1643
-0
lines changed

4 files changed

+1643
-0
lines changed

SparkML.ipynb

Lines changed: 1013 additions & 0 deletions
Large diffs are not rendered by default.

Tweet.ipynb

Lines changed: 372 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,372 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"<h3>Using Twitter API to gather tweets based on specific hashtags and their sentiment"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 2,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"from textblob import TextBlob\n",
17+
"from twython import Twython\n",
18+
"import json\n",
19+
"import pandas as pd\n",
20+
"import re\n",
21+
"\n",
22+
def clean_tweet(tweet):
    """Strip @mentions, URLs, and non-alphanumeric characters from a tweet.

    Runs of whitespace in the remaining text are collapsed to single spaces.
    """
    stripped = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet)
    return " ".join(stripped.split())
24+
"\n",
25+
def sentiment(tweet):
    """Classify the sentiment of a tweet as binary.

    Cleans the tweet, then uses TextBlob's polarity score:
    returns 1 for positive polarity (> 0), 0 otherwise
    (neutral and negative are folded together).
    """
    analysis = TextBlob(clean_tweet(tweet))
    return 1 if analysis.sentiment.polarity > 0 else 0
38+
]
39+
},
40+
{
41+
"cell_type": "markdown",
42+
"metadata": {},
43+
"source": [
44+
"### Note: You cannot get tweets unless you have access to `Twitter API`\n",
45+
"<h3>Check the format below in order to use your credentials"
46+
]
47+
},
48+
{
49+
"cell_type": "raw",
50+
"metadata": {},
51+
"source": [
52+
"{\n",
53+
" \"consumer_key\" : \"\",\n",
54+
" \"consumer_secret\" : \"\",\n",
55+
" \"access_token\" : \"\",\n",
56+
" \"access_token_secret\" : \"\"\n",
57+
"}"
58+
]
59+
},
60+
{
61+
"cell_type": "markdown",
62+
"metadata": {},
63+
"source": [
64+
"#### Define your credentials in the `twitter_credentials.json` file, based on the keys you get from the API, <u>BEFORE EXECUTING THIS PART</u>"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": 3,
70+
"metadata": {},
71+
"outputs": [],
72+
"source": [
73+
# Load the Twitter API credentials from a local JSON file
# (expected key format is shown in the raw cell above).
with open("twitter_credentials.json", "r") as file:
    creds = json.load(file)

# Instantiate the client. App-only auth (consumer key/secret) is used here;
# user tokens would be passed as: Twython(..., creds['access_token'], creds['access_token_secret'])
python_tweets = Twython(creds['consumer_key'], creds['consumer_secret'])
# Hashtags gathered in this notebook: 'airpods', 'iphone12', 'MacBookPro16'
80+
]
81+
},
82+
{
83+
"cell_type": "markdown",
84+
"metadata": {},
85+
"source": [
86+
"#### Customizing queries with hashtags, timestamp, language and quantity\n",
87+
"In this project the purpose was to get a fair quantity of each `label` (positive- and negative-meaning tweets) and `hashtag` (#airpods, #iphone12, #macbookpro16)"
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 7,
93+
"metadata": {},
94+
"outputs": [
95+
{
96+
"data": {
97+
"text/html": [
98+
"<div>\n",
99+
"<style scoped>\n",
100+
" .dataframe tbody tr th:only-of-type {\n",
101+
" vertical-align: middle;\n",
102+
" }\n",
103+
"\n",
104+
" .dataframe tbody tr th {\n",
105+
" vertical-align: top;\n",
106+
" }\n",
107+
"\n",
108+
" .dataframe thead th {\n",
109+
" text-align: right;\n",
110+
" }\n",
111+
"</style>\n",
112+
"<table border=\"1\" class=\"dataframe\">\n",
113+
" <thead>\n",
114+
" <tr style=\"text-align: right;\">\n",
115+
" <th></th>\n",
116+
" <th>id</th>\n",
117+
" <th>text</th>\n",
118+
" <th>label</th>\n",
119+
" </tr>\n",
120+
" </thead>\n",
121+
" <tbody>\n",
122+
" <tr>\n",
123+
" <th>0</th>\n",
124+
" <td>1244295516927688712</td>\n",
125+
" <td>@SAfmnews The tweets in here are full of spark...</td>\n",
126+
" <td>1</td>\n",
127+
" </tr>\n",
128+
" <tr>\n",
129+
" <th>1</th>\n",
130+
" <td>1244171436907999232</td>\n",
131+
" <td>@CHINWENDUH @Ikelectron @oblomart @I_amDozie H...</td>\n",
132+
" <td>1</td>\n",
133+
" </tr>\n",
134+
" <tr>\n",
135+
" <th>2</th>\n",
136+
" <td>1244043940736704517</td>\n",
137+
" <td>@JuliaHB1 l am old,but I understand that this ...</td>\n",
138+
" <td>1</td>\n",
139+
" </tr>\n",
140+
" <tr>\n",
141+
" <th>3</th>\n",
142+
" <td>1243605527663448068</td>\n",
143+
" <td>@pril_98 @TwoDaeFourEight @lifelessmachine @sp...</td>\n",
144+
" <td>1</td>\n",
145+
" </tr>\n",
146+
" <tr>\n",
147+
" <th>4</th>\n",
148+
" <td>1243259668174008321</td>\n",
149+
" <td>someone is already 21 yrs old but can't even m...</td>\n",
150+
" <td>1</td>\n",
151+
" </tr>\n",
152+
" <tr>\n",
153+
" <th>5</th>\n",
154+
" <td>1242642040103424000</td>\n",
155+
" <td>With kiwis using their devices more &amp;amp; scam...</td>\n",
156+
" <td>1</td>\n",
157+
" </tr>\n",
158+
" <tr>\n",
159+
" <th>6</th>\n",
160+
" <td>1242567113262256128</td>\n",
161+
" <td>@digiwonk @nora3000 Hi big fan of @spark and @...</td>\n",
162+
" <td>1</td>\n",
163+
" </tr>\n",
164+
" <tr>\n",
165+
" <th>7</th>\n",
166+
" <td>1242452796659523584</td>\n",
167+
" <td>Let there be light, let there be light\\nI spar...</td>\n",
168+
" <td>1</td>\n",
169+
" </tr>\n",
170+
" <tr>\n",
171+
" <th>8</th>\n",
172+
" <td>1242189629819887616</td>\n",
173+
" <td>@Issamoodi @weakintheheart @trishapaytas While...</td>\n",
174+
" <td>1</td>\n",
175+
" </tr>\n",
176+
" </tbody>\n",
177+
"</table>\n",
178+
"</div>"
179+
],
180+
"text/plain": [
181+
" id text \\\n",
182+
"0 1244295516927688712 @SAfmnews The tweets in here are full of spark... \n",
183+
"1 1244171436907999232 @CHINWENDUH @Ikelectron @oblomart @I_amDozie H... \n",
184+
"2 1244043940736704517 @JuliaHB1 l am old,but I understand that this ... \n",
185+
"3 1243605527663448068 @pril_98 @TwoDaeFourEight @lifelessmachine @sp... \n",
186+
"4 1243259668174008321 someone is already 21 yrs old but can't even m... \n",
187+
"5 1242642040103424000 With kiwis using their devices more &amp; scam... \n",
188+
"6 1242567113262256128 @digiwonk @nora3000 Hi big fan of @spark and @... \n",
189+
"7 1242452796659523584 Let there be light, let there be light\\nI spar... \n",
190+
"8 1242189629819887616 @Issamoodi @weakintheheart @trishapaytas While... \n",
191+
"\n",
192+
" label \n",
193+
"0 1 \n",
194+
"1 1 \n",
195+
"2 1 \n",
196+
"3 1 \n",
197+
"4 1 \n",
198+
"5 1 \n",
199+
"6 1 \n",
200+
"7 1 \n",
201+
"8 1 "
202+
]
203+
},
204+
"execution_count": 7,
205+
"metadata": {},
206+
"output_type": "execute_result"
207+
}
208+
],
209+
"source": [
210+
# Query Twitter for English tweets mentioning 'airpods'.
# NOTE(review): the standard search API caps `count` at 100 per request and
# only indexes roughly the last 7 days, so `count=5000` and a 2016 `since`
# date are silently reduced — confirm against the Twitter search API docs.
response = python_tweets.search(q='airpods', since="2016-12-13", count=5000, lang='en')

# Uncomment to inspect the raw response:
# print(json.dumps(response, sort_keys=True, indent=2))

# Collect up to 14 positive and 14 negative original (non-retweet) tweets.
dict_ = {'id': [], 'text': [], 'label': []}
p = 0  # positive-meaning tweets collected
n = 0  # negative-meaning tweets collected
for status in response['statuses']:
    if (not status['retweeted']) and ('RT @' not in status['text']):
        if n == 14 and p == 14:
            break
        if sentiment(status['text']) and p < 14:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(1)
            p += 1
        # BUG FIX: the original branch re-tested `sentiment(...)` for truth,
        # so genuinely negative tweets were never collected and surplus
        # *positive* tweets were mislabeled 0. Negative = sentiment() == 0.
        elif not sentiment(status['text']) and n < 14:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(0)
            n += 1

# Structure data in a pandas DataFrame for easier manipulation
df = pd.DataFrame(dict_)
# df.sort_values(by='favorite_count', inplace=True, ascending=False)
238+
]
239+
},
240+
{
241+
"cell_type": "code",
242+
"execution_count": 4,
243+
"metadata": {},
244+
"outputs": [],
245+
"source": [
246+
# Query Twitter for English tweets mentioning 'iphone12'.
response = python_tweets.search(q='iphone12', since="2019-09-20", count=2000, lang='en')

# Uncomment to inspect the raw response:
# print(json.dumps(response, sort_keys=True, indent=2))

# Collect up to 8 positive and 8 negative original (non-retweet) tweets.
# `dict_` is intentionally NOT re-initialized: results accumulate on top of
# the tweets gathered in the previous cell.
p = 0  # positive-meaning tweets collected
n = 0  # negative-meaning tweets collected
for status in response['statuses']:
    if (not status['retweeted']) and ('RT @' not in status['text']):
        if n == 8 and p == 8:
            break
        if sentiment(status['text']) and p < 8:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(1)
            p += 1
        # BUG FIX: the original branch re-tested `sentiment(...)` for truth,
        # so genuinely negative tweets were never collected and surplus
        # *positive* tweets were mislabeled 0. Negative = sentiment() == 0.
        elif not sentiment(status['text']) and n < 8:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(0)
            n += 1

# Structure data in a pandas DataFrame for easier manipulation
df = pd.DataFrame(dict_)
# df.sort_values(by='favorite_count', inplace=True, ascending=False)
273+
]
274+
},
275+
{
276+
"cell_type": "code",
277+
"execution_count": 5,
278+
"metadata": {},
279+
"outputs": [],
280+
"source": [
281+
# Query Twitter for English tweets mentioning 'macbookpro16'.
response = python_tweets.search(q='macbookpro16', since="2013-06-05", count=100000, lang='en')

# Uncomment to inspect the raw response:
# print(json.dumps(response, sort_keys=True, indent=2))

# Collect up to 3 positive and 3 negative original (non-retweet) tweets.
# `dict_` is intentionally NOT re-initialized: results accumulate on top of
# the tweets gathered in the previous cells.
p = 0  # positive-meaning tweets collected
n = 0  # negative-meaning tweets collected
for status in response['statuses']:
    if (not status['retweeted']) and ('RT @' not in status['text']):
        if n == 3 and p == 3:
            break
        if sentiment(status['text']) and p < 3:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(1)
            p += 1
        # BUG FIX: the original branch re-tested `sentiment(...)` for truth,
        # so genuinely negative tweets were never collected and surplus
        # *positive* tweets were mislabeled 0. Negative = sentiment() == 0.
        elif not sentiment(status['text']) and n < 3:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(0)
            n += 1

# Structure data in a pandas DataFrame for easier manipulation
df = pd.DataFrame(dict_)
# df.sort_values(by='favorite_count', inplace=True, ascending=False)
308+
]
309+
},
310+
{
311+
"cell_type": "code",
312+
"execution_count": 7,
313+
"metadata": {},
314+
"outputs": [
315+
{
316+
"name": "stdout",
317+
"output_type": "stream",
318+
"text": [
319+
"<class 'pandas.core.frame.DataFrame'>\n",
320+
"RangeIndex: 50 entries, 0 to 49\n",
321+
"Data columns (total 3 columns):\n",
322+
"id 50 non-null int64\n",
323+
"text 50 non-null object\n",
324+
"label 50 non-null int64\n",
325+
"dtypes: int64(2), object(1)\n",
326+
"memory usage: 1.3+ KB\n"
327+
]
328+
}
329+
],
330+
"source": [
331+
# Summarize the assembled dataset (the recorded run shows 50 rows with
# columns id/text/label and no nulls).
df.info()
332+
]
333+
},
334+
{
335+
"cell_type": "markdown",
336+
"metadata": {},
337+
"source": [
338+
"#### Export your dataframe as a `json` file"
339+
]
340+
},
341+
{
342+
"cell_type": "code",
343+
"execution_count": 6,
344+
"metadata": {},
345+
"outputs": [],
346+
"source": [
347+
# Persist the labeled tweets as a JSON array of records for downstream use.
df.to_json(r'piratacodex.json', orient='records')
348+
]
349+
}
350+
],
351+
"metadata": {
352+
"kernelspec": {
353+
"display_name": "Python 3",
354+
"language": "python",
355+
"name": "python3"
356+
},
357+
"language_info": {
358+
"codemirror_mode": {
359+
"name": "ipython",
360+
"version": 3
361+
},
362+
"file_extension": ".py",
363+
"mimetype": "text/x-python",
364+
"name": "python",
365+
"nbconvert_exporter": "python",
366+
"pygments_lexer": "ipython3",
367+
"version": "3.7.6"
368+
}
369+
},
370+
"nbformat": 4,
371+
"nbformat_minor": 4
372+
}

0 commit comments

Comments
 (0)