Skip to content

Commit f87db8b

Browse files
authored
Importing the project
0 parents  commit f87db8b

File tree

4 files changed

+1643
-0
lines changed

4 files changed

+1643
-0
lines changed

SparkML.ipynb

Lines changed: 1013 additions & 0 deletions
Large diffs are not rendered by default.

Tweet.ipynb

Lines changed: 372 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,372 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"<h3>Using Twitter API to gather tweets based on specific hashtags and their sentiment"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 2,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"from textblob import TextBlob\n",
17+
"from twython import Twython\n",
18+
"import json\n",
19+
"import pandas as pd\n",
20+
"import re\n",
21+
"\n",
22+
def clean_tweet(tweet):
    """Strip @mentions, URLs, and non-alphanumeric characters from a tweet.

    Runs of whitespace in the remaining text are collapsed to single spaces.
    """
    stripped = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet)
    return " ".join(stripped.split())
24+
"\n",
25+
def sentiment(tweet):
    """Classify the sentiment of a tweet as binary.

    Cleans the tweet, then uses TextBlob's polarity score:
    returns 1 for positive polarity (> 0), 0 otherwise
    (neutral and negative are folded together).
    """
    analysis = TextBlob(clean_tweet(tweet))
    return 1 if analysis.sentiment.polarity > 0 else 0
38+
]
39+
},
40+
{
41+
"cell_type": "markdown",
42+
"metadata": {},
43+
"source": [
44+
"### Note: You cannot get tweets unless you have access to `Twitter API`\n",
45+
"<h3>Check the format below in order to use your credentials"
46+
]
47+
},
48+
{
49+
"cell_type": "raw",
50+
"metadata": {},
51+
"source": [
52+
"{\n",
53+
" \"consumer_key\" : \"\",\n",
54+
" \"consumer_secret\" : \"\",\n",
55+
" \"access_token\" : \"\",\n",
56+
" \"access_token_secret\" : \"\"\n",
57+
"}"
58+
]
59+
},
60+
{
61+
"cell_type": "markdown",
62+
"metadata": {},
63+
"source": [
64+
"#### Define your credentials in the `twitter_credentials.json` file, based on the keys you get from the API, <u>BEFORE EXECUTING THIS PART</u>"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": 3,
70+
"metadata": {},
71+
"outputs": [],
72+
"source": [
73+
# Load the Twitter API credentials from a local JSON file
# (expected key format is shown in the raw cell above).
with open("twitter_credentials.json", "r") as file:
    creds = json.load(file)

# Instantiate the client. App-only auth (consumer key/secret) is used here;
# user tokens would be passed as: Twython(..., creds['access_token'], creds['access_token_secret'])
python_tweets = Twython(creds['consumer_key'], creds['consumer_secret'])
# Hashtags gathered in this notebook: 'airpods', 'iphone12', 'MacBookPro16'
80+
]
81+
},
82+
{
83+
"cell_type": "markdown",
84+
"metadata": {},
85+
"source": [
86+
"#### Customizing queries with hashtags, timestamp, language and quantity\n",
87+
"In this project the purpose was to get a fair quantity of each `label` (positive- and negative-meaning tweets) and `hashtag` (#airpods, #iphone12, #macbookpro16)"
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 7,
93+
"metadata": {},
94+
"outputs": [
95+
{
96+
"data": {
97+
"text/html": [
98+
"<div>\n",
99+
"<style scoped>\n",
100+
" .dataframe tbody tr th:only-of-type {\n",
101+
" vertical-align: middle;\n",
102+
" }\n",
103+
"\n",
104+
" .dataframe tbody tr th {\n",
105+
" vertical-align: top;\n",
106+
" }\n",
107+
"\n",
108+
" .dataframe thead th {\n",
109+
" text-align: right;\n",
110+
" }\n",
111+
"</style>\n",
112+
"<table border=\"1\" class=\"dataframe\">\n",
113+
" <thead>\n",
114+
" <tr style=\"text-align: right;\">\n",
115+
" <th></th>\n",
116+
" <th>id</th>\n",
117+
" <th>text</th>\n",
118+
" <th>label</th>\n",
119+
" </tr>\n",
120+
" </thead>\n",
121+
" <tbody>\n",
122+
" <tr>\n",
123+
" <th>0</th>\n",
124+
" <td>1244295516927688712</td>\n",
125+
" <td>@SAfmnews The tweets in here are full of spark...</td>\n",
126+
" <td>1</td>\n",
127+
" </tr>\n",
128+
" <tr>\n",
129+
" <th>1</th>\n",
130+
" <td>1244171436907999232</td>\n",
131+
" <td>@CHINWENDUH @Ikelectron @oblomart @I_amDozie H...</td>\n",
132+
" <td>1</td>\n",
133+
" </tr>\n",
134+
" <tr>\n",
135+
" <th>2</th>\n",
136+
" <td>1244043940736704517</td>\n",
137+
" <td>@JuliaHB1 l am old,but I understand that this ...</td>\n",
138+
" <td>1</td>\n",
139+
" </tr>\n",
140+
" <tr>\n",
141+
" <th>3</th>\n",
142+
" <td>1243605527663448068</td>\n",
143+
" <td>@pril_98 @TwoDaeFourEight @lifelessmachine @sp...</td>\n",
144+
" <td>1</td>\n",
145+
" </tr>\n",
146+
" <tr>\n",
147+
" <th>4</th>\n",
148+
" <td>1243259668174008321</td>\n",
149+
" <td>someone is already 21 yrs old but can't even m...</td>\n",
150+
" <td>1</td>\n",
151+
" </tr>\n",
152+
" <tr>\n",
153+
" <th>5</th>\n",
154+
" <td>1242642040103424000</td>\n",
155+
" <td>With kiwis using their devices more &amp;amp; scam...</td>\n",
156+
" <td>1</td>\n",
157+
" </tr>\n",
158+
" <tr>\n",
159+
" <th>6</th>\n",
160+
" <td>1242567113262256128</td>\n",
161+
" <td>@digiwonk @nora3000 Hi big fan of @spark and @...</td>\n",
162+
" <td>1</td>\n",
163+
" </tr>\n",
164+
" <tr>\n",
165+
" <th>7</th>\n",
166+
" <td>1242452796659523584</td>\n",
167+
" <td>Let there be light, let there be light\\nI spar...</td>\n",
168+
" <td>1</td>\n",
169+
" </tr>\n",
170+
" <tr>\n",
171+
" <th>8</th>\n",
172+
" <td>1242189629819887616</td>\n",
173+
" <td>@Issamoodi @weakintheheart @trishapaytas While...</td>\n",
174+
" <td>1</td>\n",
175+
" </tr>\n",
176+
" </tbody>\n",
177+
"</table>\n",
178+
"</div>"
179+
],
180+
"text/plain": [
181+
" id text \\\n",
182+
"0 1244295516927688712 @SAfmnews The tweets in here are full of spark... \n",
183+
"1 1244171436907999232 @CHINWENDUH @Ikelectron @oblomart @I_amDozie H... \n",
184+
"2 1244043940736704517 @JuliaHB1 l am old,but I understand that this ... \n",
185+
"3 1243605527663448068 @pril_98 @TwoDaeFourEight @lifelessmachine @sp... \n",
186+
"4 1243259668174008321 someone is already 21 yrs old but can't even m... \n",
187+
"5 1242642040103424000 With kiwis using their devices more &amp; scam... \n",
188+
"6 1242567113262256128 @digiwonk @nora3000 Hi big fan of @spark and @... \n",
189+
"7 1242452796659523584 Let there be light, let there be light\\nI spar... \n",
190+
"8 1242189629819887616 @Issamoodi @weakintheheart @trishapaytas While... \n",
191+
"\n",
192+
" label \n",
193+
"0 1 \n",
194+
"1 1 \n",
195+
"2 1 \n",
196+
"3 1 \n",
197+
"4 1 \n",
198+
"5 1 \n",
199+
"6 1 \n",
200+
"7 1 \n",
201+
"8 1 "
202+
]
203+
},
204+
"execution_count": 7,
205+
"metadata": {},
206+
"output_type": "execute_result"
207+
}
208+
],
209+
"source": [
210+
# Query Twitter for English tweets mentioning 'airpods'.
# NOTE(review): the standard search API caps `count` at 100 per request and
# only indexes roughly the last 7 days, so `count=5000` and a 2016 `since`
# date are silently reduced — confirm against the Twitter search API docs.
response = python_tweets.search(q='airpods', since="2016-12-13", count=5000, lang='en')

# Uncomment to inspect the raw response:
# print(json.dumps(response, sort_keys=True, indent=2))

# Collect up to 14 positive and 14 negative original (non-retweet) tweets.
dict_ = {'id': [], 'text': [], 'label': []}
p = 0  # positive-meaning tweets collected
n = 0  # negative-meaning tweets collected
for status in response['statuses']:
    if (not status['retweeted']) and ('RT @' not in status['text']):
        if n == 14 and p == 14:
            break
        if sentiment(status['text']) and p < 14:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(1)
            p += 1
        # BUG FIX: the original branch re-tested `sentiment(...)` for truth,
        # so genuinely negative tweets were never collected and surplus
        # *positive* tweets were mislabeled 0. Negative = sentiment() == 0.
        elif not sentiment(status['text']) and n < 14:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(0)
            n += 1

# Structure data in a pandas DataFrame for easier manipulation
df = pd.DataFrame(dict_)
# df.sort_values(by='favorite_count', inplace=True, ascending=False)
238+
]
239+
},
240+
{
241+
"cell_type": "code",
242+
"execution_count": 4,
243+
"metadata": {},
244+
"outputs": [],
245+
"source": [
246+
# Query Twitter for English tweets mentioning 'iphone12'.
response = python_tweets.search(q='iphone12', since="2019-09-20", count=2000, lang='en')

# Uncomment to inspect the raw response:
# print(json.dumps(response, sort_keys=True, indent=2))

# Collect up to 8 positive and 8 negative original (non-retweet) tweets.
# `dict_` is intentionally NOT re-initialized: results accumulate on top of
# the tweets gathered in the previous cell.
p = 0  # positive-meaning tweets collected
n = 0  # negative-meaning tweets collected
for status in response['statuses']:
    if (not status['retweeted']) and ('RT @' not in status['text']):
        if n == 8 and p == 8:
            break
        if sentiment(status['text']) and p < 8:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(1)
            p += 1
        # BUG FIX: the original branch re-tested `sentiment(...)` for truth,
        # so genuinely negative tweets were never collected and surplus
        # *positive* tweets were mislabeled 0. Negative = sentiment() == 0.
        elif not sentiment(status['text']) and n < 8:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(0)
            n += 1

# Structure data in a pandas DataFrame for easier manipulation
df = pd.DataFrame(dict_)
# df.sort_values(by='favorite_count', inplace=True, ascending=False)
273+
]
274+
},
275+
{
276+
"cell_type": "code",
277+
"execution_count": 5,
278+
"metadata": {},
279+
"outputs": [],
280+
"source": [
281+
# Query Twitter for English tweets mentioning 'macbookpro16'.
response = python_tweets.search(q='macbookpro16', since="2013-06-05", count=100000, lang='en')

# Uncomment to inspect the raw response:
# print(json.dumps(response, sort_keys=True, indent=2))

# Collect up to 3 positive and 3 negative original (non-retweet) tweets.
# `dict_` is intentionally NOT re-initialized: results accumulate on top of
# the tweets gathered in the previous cells.
p = 0  # positive-meaning tweets collected
n = 0  # negative-meaning tweets collected
for status in response['statuses']:
    if (not status['retweeted']) and ('RT @' not in status['text']):
        if n == 3 and p == 3:
            break
        if sentiment(status['text']) and p < 3:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(1)
            p += 1
        # BUG FIX: the original branch re-tested `sentiment(...)` for truth,
        # so genuinely negative tweets were never collected and surplus
        # *positive* tweets were mislabeled 0. Negative = sentiment() == 0.
        elif not sentiment(status['text']) and n < 3:
            dict_['id'].append(status['id'])
            dict_['text'].append(status['text'])
            dict_['label'].append(0)
            n += 1

# Structure data in a pandas DataFrame for easier manipulation
df = pd.DataFrame(dict_)
# df.sort_values(by='favorite_count', inplace=True, ascending=False)
308+
]
309+
},
310+
{
311+
"cell_type": "code",
312+
"execution_count": 7,
313+
"metadata": {},
314+
"outputs": [
315+
{
316+
"name": "stdout",
317+
"output_type": "stream",
318+
"text": [
319+
"<class 'pandas.core.frame.DataFrame'>\n",
320+
"RangeIndex: 50 entries, 0 to 49\n",
321+
"Data columns (total 3 columns):\n",
322+
"id 50 non-null int64\n",
323+
"text 50 non-null object\n",
324+
"label 50 non-null int64\n",
325+
"dtypes: int64(2), object(1)\n",
326+
"memory usage: 1.3+ KB\n"
327+
]
328+
}
329+
],
330+
"source": [
331+
# Summarize the assembled dataset (the recorded run shows 50 rows with
# columns id/text/label and no nulls).
df.info()
332+
]
333+
},
334+
{
335+
"cell_type": "markdown",
336+
"metadata": {},
337+
"source": [
338+
"#### Export your dataframe as a `json` file"
339+
]
340+
},
341+
{
342+
"cell_type": "code",
343+
"execution_count": 6,
344+
"metadata": {},
345+
"outputs": [],
346+
"source": [
347+
# Persist the labeled tweets as a JSON array of records for downstream use.
df.to_json(r'piratacodex.json', orient='records')
348+
]
349+
}
350+
],
351+
"metadata": {
352+
"kernelspec": {
353+
"display_name": "Python 3",
354+
"language": "python",
355+
"name": "python3"
356+
},
357+
"language_info": {
358+
"codemirror_mode": {
359+
"name": "ipython",
360+
"version": 3
361+
},
362+
"file_extension": ".py",
363+
"mimetype": "text/x-python",
364+
"name": "python",
365+
"nbconvert_exporter": "python",
366+
"pygments_lexer": "ipython3",
367+
"version": "3.7.6"
368+
}
369+
},
370+
"nbformat": 4,
371+
"nbformat_minor": 4
372+
}

0 commit comments

Comments
 (0)