{ "cells": [ { "cell_type": "markdown", "id": "ece502db-27d3-4a24-bb40-5dccf3d42f4c", "metadata": {}, "source": [ "# Logistic Regression" ] }, { "cell_type": "markdown", "id": "6f271d7e-29db-41ba-a4e3-80e541be8a32", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "id": "9a1df8b8-16b2-43b3-817d-67cf5d0438cb", "metadata": {}, "outputs": [], "source": [ "from fastcore.all import *\n", "import numpy as np\n", "import scipy as sp\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import random\n", "import nltk\n", "import re\n", "import string\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn import metrics\n", "from nltk.corpus import twitter_samples\n", "from rich.console import Console\n", "from nltk.corpus import stopwords # module for stop words that come with NLTK\n", "from nltk.stem import PorterStemmer # module for stemming\n", "from nltk.tokenize import TweetTokenizer # module for tokenizing strings" ] }, { "cell_type": "code", "execution_count": 2, "id": "2ede2468-2762-412a-a95f-6fa295d56f0d", "metadata": {}, "outputs": [], "source": [ "sns.set()\n", "console = Console()" ] }, { "cell_type": "markdown", "id": "6c04a079-0aac-4327-bea7-311570740015", "metadata": {}, "source": [ "## Download Dataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "e7665ef5-3d13-453d-a240-51e994a78ccf", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package twitter_samples to\n", "[nltk_data] /home/rahul.saraf/nltk_data...\n", "[nltk_data] Package twitter_samples is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] /home/rahul.saraf/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"nltk.download('twitter_samples')\n", "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": 4, "id": "7e04909e-8aa3-4eef-813b-1bfdf3a9123a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'contributors': None,\n", " 'coordinates': None,\n", " 'text': 'hopeless for tmr :(',\n", " 'user': {'screen_name': 'yuwraxkim',\n", " 'time_zone': 'Jakarta',\n", " 'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/585476378365014016/j1mvQu3c.png',\n", " 'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/585476378365014016/j1mvQu3c.png',\n", " 'default_profile_image': False,\n", " 'url': None,\n", " 'profile_text_color': '000000',\n", " 'following': False,\n", " 'listed_count': 3,\n", " 'entities': {'description': {'urls': []}},\n", " 'utc_offset': 25200,\n", " 'profile_sidebar_border_color': '000000',\n", " 'name': 'yuwra ✈ ',\n", " 'favourites_count': 196,\n", " 'followers_count': 1281,\n", " 'location': 'wearegsd;favor;pucukfams;barbx',\n", " 'protected': False,\n", " 'notifications': False,\n", " 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/622631732399898624/kmYsX_k1_normal.jpg',\n", " 'profile_use_background_image': True,\n", " 'profile_image_url': 'http://pbs.twimg.com/profile_images/622631732399898624/kmYsX_k1_normal.jpg',\n", " 'lang': 'id',\n", " 'statuses_count': 19710,\n", " 'friends_count': 1264,\n", " 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/3078803375/1433287528',\n", " 'geo_enabled': True,\n", " 'is_translator': False,\n", " 'contributors_enabled': False,\n", " 'profile_sidebar_fill_color': '000000',\n", " 'created_at': 'Sun Mar 08 05:43:40 +0000 2015',\n", " 'verified': False,\n", " 'profile_link_color': '000000',\n", " 'is_translation_enabled': False,\n", " 'has_extended_profile': False,\n", " 'id_str': '3078803375',\n", " 'follow_request_sent': False,\n", " 'profile_background_color': '000000',\n", " 'default_profile': 
False,\n", " 'profile_background_tile': True,\n", " 'id': 3078803375,\n", " 'description': '⇨ [V] TravelGency █ 2/4 Goddest from Girls Day █ 92L █ sucrp'},\n", " 'retweet_count': 0,\n", " 'favorited': False,\n", " 'entities': {'hashtags': [], 'user_mentions': [], 'urls': [], 'symbols': []},\n", " 'source': '<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Mobile Web (M2)</a>',\n", " 'truncated': False,\n", " 'geo': None,\n", " 'in_reply_to_status_id_str': None,\n", " 'is_quote_status': False,\n", " 'in_reply_to_user_id_str': None,\n", " 'place': None,\n", " 'in_reply_to_status_id': None,\n", " 'in_reply_to_screen_name': None,\n", " 'lang': 'en',\n", " 'retweeted': False,\n", " 'in_reply_to_user_id': None,\n", " 'created_at': 'Fri Jul 24 10:42:49 +0000 2015',\n", " 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},\n", " 'favorite_count': 0,\n", " 'id_str': '624530164626534400',\n", " 'id': 624530164626534400}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "L(twitter_samples.docs())[0]" ] }, { "cell_type": "code", "execution_count": 5, "id": "be8f9dd8-b382-4423-8b84-99d8059a7bfb", "metadata": {}, "outputs": [], "source": [ "ptweets = twitter_samples.strings('positive_tweets.json')\n", "ntweets = twitter_samples.strings('negative_tweets.json')" ] }, { "cell_type": "code", "execution_count": 6, "id": "a7df9941-17f4-4214-8b6f-aacbd040bccf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5000, 5000)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(ptweets), len(ntweets)" ] }, { "cell_type": "code", "execution_count": 7, "id": "c724e814-4681-4d33-be40-30eadf3a4bbc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008000; text-decoration-color: #008000\">Hi BAM ! 
@BarsAndMelody </span>\n", "<span style=\"color: #008000; text-decoration-color: #008000\">Can you follow my bestfriend @969Horan696 ? </span>\n", "<span style=\"color: #008000; text-decoration-color: #008000\">She loves you a lot :</span><span style=\"color: #008000; text-decoration-color: #008000; font-weight: bold\">)</span><span style=\"color: #008000; text-decoration-color: #008000\"> </span>\n", "<span style=\"color: #008000; text-decoration-color: #008000\">See you in Warsaw &lt;</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span><span style=\"color: #008000; text-decoration-color: #008000\"> </span>\n", "<span style=\"color: #008000; text-decoration-color: #008000\">Love you &lt;</span><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3</span><span style=\"color: #008000; text-decoration-color: #008000\"> x23</span>\n", "</pre>\n" ], "text/plain": [ "\u001b[32mHi BAM ! @BarsAndMelody \u001b[0m\n", "\u001b[32mCan you follow my bestfriend @969Horan696 ? 
\u001b[0m\n", "\u001b[32mShe loves you a lot :\u001b[0m\u001b[1;32m)\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mSee you in Warsaw <\u001b[0m\u001b[1;36m3\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mLove you <\u001b[0m\u001b[1;36m3\u001b[0m\u001b[32m x23\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000\">Mtaani tunaita pussy viazi choma and we still get laid :-</span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">(</span>\n", "</pre>\n" ], "text/plain": [ "\u001b[31mMtaani tunaita pussy viazi choma and we still get laid :-\u001b[0m\u001b[1;31m(\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# random.choice always picks a valid element; random.randint(0, 5000) includes 5000 and\n", "# would raise IndexError on these 5000-element lists.\n", "console.print(random.choice(ptweets), style='green')\n", "console.print(random.choice(ntweets), style='red')" ] }, { "cell_type": "markdown", "id": "611ea39a-8c60-4543-8a5d-5ab1ce384fd5", "metadata": {}, "source": [ "## Preprocessing" ] }, { "cell_type": "markdown", "id": "c39c94f0-7168-456b-b721-7e85cffdfa93", "metadata": {}, "source": [ "```{admonition} What are we going to do?\n", "1. Remove hyperlinks, twitter marks and styles\n", "2. Tokenize\n", "3. Remove Stopwords\n", "4. 
Stemming\n", "```" ] }, { "cell_type": "markdown", "id": "dedeeaf4-7a23-4a7c-9272-6c385ef18f97", "metadata": {}, "source": [ "### Remove hyperlinks, twitter marks and styles" ] }, { "cell_type": "code", "execution_count": 8, "id": "065f0a82-93bf-4ae8-b6ff-9cc869a41788", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>class</th>\n", " <th>Tweet</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>positive</td>\n", " <td>#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>positive</td>\n", " <td>@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>positive</td>\n", " <td>@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>positive</td>\n", " <td>@97sides CONGRATS :)</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>positive</td>\n", " <td>yeaaaah yippppy!!! 
my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>9995</th>\n", " <td>negative</td>\n", " <td>I wanna change my avi but uSanele :(</td>\n", " </tr>\n", " <tr>\n", " <th>9996</th>\n", " <td>negative</td>\n", " <td>MY PUPPY BROKE HER FOOT :(</td>\n", " </tr>\n", " <tr>\n", " <th>9997</th>\n", " <td>negative</td>\n", " <td>where's all the jaebum baby pictures :((</td>\n", " </tr>\n", " <tr>\n", " <th>9998</th>\n", " <td>negative</td>\n", " <td>But but Mr Ahmad Maslan cooks too :( https://t.co/ArCiD31Zv6</td>\n", " </tr>\n", " <tr>\n", " <th>9999</th>\n", " <td>negative</td>\n", " <td>@eawoman As a Hull supporter I am expecting a misserable few weeks :-(</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>10000 rows × 2 columns</p>\n", "</div>" ], "text/plain": [ " class \\\n", "0 positive \n", "1 positive \n", "2 positive \n", "3 positive \n", "4 positive \n", "... ... \n", "9995 negative \n", "9996 negative \n", "9997 negative \n", "9998 negative \n", "9999 negative \n", "\n", " Tweet \n", "0 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :) \n", "1 @Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks! \n", "2 @DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?! \n", "3 @97sides CONGRATS :) \n", "4 yeaaaah yippppy!!! my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days \n", "... ... 
\n", "9995 I wanna change my avi but uSanele :( \n", "9996 MY PUPPY BROKE HER FOOT :( \n", "9997 where's all the jaebum baby pictures :(( \n", "9998 But but Mr Ahmad Maslan cooks too :( https://t.co/ArCiD31Zv6 \n", "9999 @eawoman As a Hull supporter I am expecting a misserable few weeks :-( \n", "\n", "[10000 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df = pd.DataFrame({'positive':ptweets, 'negative':ntweets})\\\n", " .unstack().reset_index().drop(columns=['level_1'])\\\n", " .rename(columns={0:'Tweet', 'level_0':'class'})\n", "with pd.option_context('max_colwidth', 0):\n", " display(df)" ] }, { "cell_type": "code", "execution_count": 9, "id": "21ed528d-b557-4141-93cd-a46683943791", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "class negative\n", "Tweet @Camy19994 FOLLOWED ME THANKS, AND\\n@justinbie...\n", "Name: 7777, dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[2777+5000]" ] }, { "cell_type": "code", "execution_count": 10, "id": "83581eb6-cb91-4538-906c-aebe881b46c1", "metadata": {}, "outputs": [], "source": [ "tweet = df.iloc[0]['Tweet']\n", "remove_old_style = lambda tweet: re.sub(r'^RT[\\s]+', '', tweet)\n", "remove_url = lambda tweet: re.sub(r'https?://[^\\s\\n\\r]+', '', tweet)\n", "remove_hash = lambda tweet: re.sub(r'#', '', tweet)\n", "tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,\n", "                           reduce_len=True)\n", "\n", "stopwords_english = stopwords.words('english')\n", "stemmer = PorterStemmer()\n", "skip_words = stopwords_english+list(string.punctuation)\n", "# Membership tests against a list are linear; the frozen set makes filtering O(1) per token.\n", "# skip_words itself stays a list so any other cell that uses it keeps working.\n", "skip_word_set = frozenset(skip_words)\n", "\n", "def process_tweet(tweet):\n", "    \"\"\"Turn one raw tweet string into a list of stemmed tokens.\n", "\n", "    Steps: strip a leading 'RT' marker, remove http(s) URLs, drop '#' characters,\n", "    tokenize with the notebook-level TweetTokenizer (configured with\n", "    preserve_case=False, strip_handles=True, reduce_len=True), discard English\n", "    stop words and single punctuation characters, then Porter-stem what is left.\n", "    \"\"\"\n", "    clean_tweet = remove_hash(remove_url(remove_old_style(tweet)))\n", "    tokens = tokenizer.tokenize(clean_tweet)\n", "    return [stemmer.stem(word) for word in tokens\n", "            if word not in skip_word_set]" ] }, { "cell_type": 
"code", "execution_count": 11, "id": "17d9752f-ebbd-424b-8c7d-757167026bfe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 [followfriday, top, engag, member, commun, wee...\n", "1 [hey, jame, odd, :/, pleas, call, contact, cen...\n", "2 [listen, last, night, :), bleed, amaz, track, ...\n", "3 [congrat, :)]\n", "4 [yeaaah, yipppi, accnt, verifi, rqst, succeed,...\n", " ... \n", "9995 [wanna, chang, avi, usanel, :(]\n", "9996 [puppi, broke, foot, :(]\n", "9997 [where', jaebum, babi, pictur, :(]\n", "9998 [mr, ahmad, maslan, cook, :(]\n", "9999 [hull, support, expect, misser, week, :-(]\n", "Name: PTweet, Length: 10000, dtype: object" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['PTweet']=df['Tweet'].apply(process_tweet)\n", "df['PTweet']" ] }, { "cell_type": "code", "execution_count": 12, "id": "e991eaed-27c4-494f-9b3e-dcc09d875209", "metadata": {}, "outputs": [], "source": [ "# Flatten with a comprehension: Series.sum() on lists re-copies the growing list for every\n", "# tweet (quadratic) and returns 0 instead of [] when there are no rows.\n", "toks = [tok for tweet_toks in df['PTweet'] for tok in tweet_toks]" ] }, { "cell_type": "markdown", "id": "f4741caa-c2a2-4a92-905b-93e67a5ea090", "metadata": {}, "source": [ "## Feature Engineering" ] }, { "cell_type": "markdown", "id": "017801d9-6e22-41fe-ba4c-88b308a6cf9b", "metadata": {}, "source": [ "### Building Frequency Dictionary" ] }, { "cell_type": "code", "execution_count": 13, "id": "9b7e36c6-2bda-4ec8-bad8-25fc45d05cd7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'positive': 32, 'negative': 6}" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def contains_tok(tweet_tokens, tok):\n", "    \"\"\"Return True when `tok` occurs in `tweet_tokens`, otherwise False.\"\"\"\n", "    return tok in tweet_tokens\n", "\n", "# Number of tweets per class that contain the second token of the corpus.\n", "df[df.apply(lambda row: contains_tok(row['PTweet'], toks[1]), axis=1)]['class'].value_counts().to_dict()" ] }, { "cell_type": "code", "execution_count": 14, "id": "8aa8b87b-3d44-4f46-8642-06060c967bbe", "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "['children',\n", " 'latin',\n", " 'bilal',\n", " 'leno',\n", " 'savag',\n", " 'hyung',\n", " 'braxton',\n", " 'statement',\n", " 'convinc',\n", " 'therefor']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "toks = list(set(df['PTweet'].sum()))\n", "toks[:10]\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "7ea37889-14b1-4123-8f43-968d8feec5b5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " </tr>\n", " <tr>\n", " <th>word</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>children</th>\n", " <td>3.0</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>latin</th>\n", " <td>3.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>bilal</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>leno</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>savag</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>hyung</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>braxton</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>statement</th>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>convinc</th>\n", " <td>0.0</td>\n", " <td>3.0</td>\n", " </tr>\n", " <tr>\n", " <th>therefor</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " positive negative\n", "word \n", "children 3.0 2.0\n", "latin 3.0 0.0\n", 
"bilal 0.0 1.0\n", "leno 0.0 1.0\n", "savag 1.0 0.0\n", "hyung 0.0 1.0\n", "braxton 0.0 1.0\n", "statement 1.0 1.0\n", "convinc 0.0 3.0\n", "therefor 0.0 1.0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# toks[:10]\n", "pd.DataFrame([{'word':tok, \n", " **df[df.apply(lambda row: contains_tok(row['PTweet'], tok), axis=1)]['class'].value_counts().to_dict()} \n", " for tok in toks[:10]]).fillna(0).set_index('word')" ] }, { "cell_type": "code", "execution_count": 16, "id": "05bbbc2e-9684-4f6e-9696-8ffbeaf7733e", "metadata": {}, "outputs": [], "source": [ "def build_freqs(df):\n", "    \"\"\"Count, for every token, how many tweets of each class contain it.\n", "\n", "    Parameters\n", "    ----------\n", "    df : DataFrame with a 'class' column (label) and a 'PTweet' column (list of tokens).\n", "\n", "    Returns\n", "    -------\n", "    DataFrame indexed by 'word' with one float column per class. Columns follow the\n", "    order in which the classes first appear in `df`; rows are sorted by word.\n", "    \"\"\"\n", "    # Pin the column order: callers may read the columns positionally, and building the table\n", "    # from per-token dicts let whichever token came first decide which class was column 0.\n", "    classes = list(df['class'].unique())\n", "    # set() makes a token repeated inside one tweet count once (tweets containing it, not occurrences).\n", "    pairs = pd.DataFrame(\n", "        [(tok, label) for label, tweet_toks in zip(df['class'], df['PTweet']) for tok in set(tweet_toks)],\n", "        columns=['word', 'class'])\n", "    # One pass over the (word, class) pairs instead of rescanning every tweet for every token.\n", "    freqs = pd.crosstab(pairs['word'], pairs['class']).reindex(columns=classes, fill_value=0)\n", "    freqs.columns.name = None\n", "    return freqs.astype(float)" ] }, { "cell_type": "code", "execution_count": 17, "id": "c809697c-8a6f-4928-bcb1-344d304ec025", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " </tr>\n", " <tr>\n", " <th>word</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>children</th>\n", " <td>3.0</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>latin</th>\n", " <td>3.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>bilal</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>leno</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>savag</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " 
</tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " positive negative\n", "word \n", "children 3.0 2.0\n", "latin 3.0 0.0\n", "bilal 0.0 1.0\n", "leno 0.0 1.0\n", "savag 1.0 0.0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq=build_freqs(df); df_freq.head()" ] }, { "cell_type": "code", "execution_count": 18, "id": "eca4eaf8-d0d3-4a52-8ebe-257d6928755e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>10507.000000</td>\n", " <td>10507.000000</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>3.172361</td>\n", " <td>3.077567</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>37.991689</td>\n", " <td>44.787129</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>1.000000</td>\n", " <td>1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>1.000000</td>\n", " <td>1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>3541.000000</td>\n", " <td>4422.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " positive negative\n", "count 10507.000000 10507.000000\n", "mean 3.172361 3.077567\n", "std 37.991689 44.787129\n", "min 0.000000 0.000000\n", "25% 0.000000 0.000000\n", "50% 1.000000 1.000000\n", "75% 1.000000 1.000000\n", "max 3541.000000 4422.000000" ] }, 
"execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq.describe()" ] }, { "cell_type": "code", "execution_count": 19, "id": "bbd2cd17-7123-409c-9138-db9ea139371d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " </tr>\n", " <tr>\n", " <th>word</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>:)</th>\n", " <td>3541.0</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>:-)</th>\n", " <td>669.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>thank</th>\n", " <td>636.0</td>\n", " <td>105.0</td>\n", " </tr>\n", " <tr>\n", " <th>:d</th>\n", " <td>628.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>follow</th>\n", " <td>365.0</td>\n", " <td>169.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>💎</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>gate</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>goodmus</th>\n", " <td>0.0</td>\n", " <td>4.0</td>\n", " </tr>\n", " <tr>\n", " <th>322</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>3a2ad</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>10507 rows × 2 columns</p>\n", "</div>" ], "text/plain": [ " positive negative\n", "word \n", ":) 3541.0 2.0\n", ":-) 669.0 0.0\n", "thank 636.0 105.0\n", ":d 628.0 0.0\n", "follow 365.0 169.0\n", "... ... 
...\n", "💎 0.0 1.0\n", "gate 0.0 1.0\n", "goodmus 0.0 4.0\n", "322 0.0 1.0\n", "3a2ad 0.0 1.0\n", "\n", "[10507 rows x 2 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq.sort_values(by='positive', ascending=False)" ] }, { "cell_type": "code", "execution_count": 20, "id": "c16bd5f0-a78c-407e-95f5-d05068f8318e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " </tr>\n", " <tr>\n", " <th>word</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>:(</th>\n", " <td>1.0</td>\n", " <td>4422.0</td>\n", " </tr>\n", " <tr>\n", " <th>:-(</th>\n", " <td>0.0</td>\n", " <td>481.0</td>\n", " </tr>\n", " <tr>\n", " <th>i'm</th>\n", " <td>173.0</td>\n", " <td>318.0</td>\n", " </tr>\n", " <tr>\n", " <th>miss</th>\n", " <td>27.0</td>\n", " <td>296.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>253.0</td>\n", " <td>284.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>swasa</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>soph</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>ef</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>cocoar</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>kw</th>\n", " <td>2.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>10507 rows × 2 columns</p>\n", "</div>" ], "text/plain": [ " positive negative\n", "word 
\n", ":( 1.0 4422.0\n", ":-( 0.0 481.0\n", "i'm 173.0 318.0\n", "miss 27.0 296.0\n", "... 253.0 284.0\n", "... ... ...\n", "swasa 1.0 0.0\n", "soph 1.0 0.0\n", "ef 1.0 0.0\n", "cocoar 1.0 0.0\n", "kw 2.0 0.0\n", "\n", "[10507 rows x 2 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq.sort_values(by='negative', ascending=False)" ] }, { "cell_type": "markdown", "id": "d4d65031-568e-4088-9067-96addcb09bcb", "metadata": {}, "source": [ "### Scoring Tweets" ] }, { "cell_type": "code", "execution_count": 21, "id": "a799d5ea-c40f-4f72-803a-c6b062a12b95", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'positive': 3737.0, 'negative': 69.0}" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweet_token=df['PTweet'][0]\n", "df_freq.loc[tweet_token].sum().to_dict()" ] }, { "cell_type": "code", "execution_count": 22, "id": "8c3ded61-c976-4300-bcc5-73d0430298c4", "metadata": {}, "outputs": [], "source": [ "def score_tweet(tweet_tokens, freqs=None):\n", "    \"\"\"Build the [positive, negative, bias] feature vector for one tokenized tweet.\n", "\n", "    Parameters\n", "    ----------\n", "    tweet_tokens : list of tokens; a token that appears twice contributes its counts twice.\n", "    freqs : frequency table indexed by word with 'positive' and 'negative' columns.\n", "        Defaults to the notebook-level `df_freq`.\n", "\n", "    Returns\n", "    -------\n", "    [sum of positive counts, sum of negative counts, 1.0]\n", "    \"\"\"\n", "    if freqs is None:\n", "        freqs = df_freq\n", "    # reindex (unlike .loc) yields NaN rows for tokens missing from the table, which sum()\n", "    # skips, so tweets with out-of-vocabulary words get scored instead of raising KeyError.\n", "    totals = freqs[['positive', 'negative']].reindex(tweet_tokens).sum()\n", "    # Pick the totals by name so the result order never depends on the table's column order.\n", "    return [float(totals['positive']), float(totals['negative']), 1.0]" ] }, { "cell_type": "code", "execution_count": 23, "id": "12ee70da-9cc8-47a3-9152-24b72039cfdd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[3737.0, 69.0, 1.0]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "score_tweet(tweet_token)" ] }, { "cell_type": "code", "execution_count": 24, "id": "e0979071-8ccc-4cb4-9ef0-ba4e193fc29f", "metadata": {}, "outputs": [], "source": [ "df['positive'], df['negative'], df['bias']=zip(*df['PTweet'].map(score_tweet))" ] }, { "cell_type": "code", "execution_count": 25, "id": "34b2a736-bdf3-4bf0-931a-fb4f969c8ff1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " 
.dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>class</th>\n", " <th>Tweet</th>\n", " <th>PTweet</th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " <th>bias</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>positive</td>\n", " <td>#FollowFriday @France_Inte @PKuchly57 @Milipol...</td>\n", " <td>[followfriday, top, engag, member, commun, wee...</td>\n", " <td>3737.0</td>\n", " <td>69.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>positive</td>\n", " <td>@Lamb2ja Hey James! How odd :/ Please call our...</td>\n", " <td>[hey, jame, odd, :/, pleas, call, contact, cen...</td>\n", " <td>4448.0</td>\n", " <td>473.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>positive</td>\n", " <td>@DespiteOfficial we had a listen last night :)...</td>\n", " <td>[listen, last, night, :), bleed, amaz, track, ...</td>\n", " <td>3728.0</td>\n", " <td>159.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>positive</td>\n", " <td>@97sides CONGRATS :)</td>\n", " <td>[congrat, :)]</td>\n", " <td>3562.0</td>\n", " <td>4.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>positive</td>\n", " <td>yeaaaah yippppy!!! 
my accnt verified rqst has...</td>\n", " <td>[yeaaah, yipppi, accnt, verifi, rqst, succeed,...</td>\n", " <td>3878.0</td>\n", " <td>273.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>9995</th>\n", " <td>negative</td>\n", " <td>I wanna change my avi but uSanele :(</td>\n", " <td>[wanna, chang, avi, usanel, :(]</td>\n", " <td>55.0</td>\n", " <td>4546.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>9996</th>\n", " <td>negative</td>\n", " <td>MY PUPPY BROKE HER FOOT :(</td>\n", " <td>[puppi, broke, foot, :(]</td>\n", " <td>3.0</td>\n", " <td>4439.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>9997</th>\n", " <td>negative</td>\n", " <td>where's all the jaebum baby pictures :((</td>\n", " <td>[where', jaebum, babi, pictur, :(]</td>\n", " <td>34.0</td>\n", " <td>4490.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>9998</th>\n", " <td>negative</td>\n", " <td>But but Mr Ahmad Maslan cooks too :( https://t...</td>\n", " <td>[mr, ahmad, maslan, cook, :(]</td>\n", " <td>9.0</td>\n", " <td>4434.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>9999</th>\n", " <td>negative</td>\n", " <td>@eawoman As a Hull supporter I am expecting a ...</td>\n", " <td>[hull, support, expect, misser, week, :-(]</td>\n", " <td>116.0</td>\n", " <td>565.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>10000 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " class Tweet \\\n", "0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... \n", "1 positive @Lamb2ja Hey James! How odd :/ Please call our... \n", "2 positive @DespiteOfficial we had a listen last night :)... \n", "3 positive @97sides CONGRATS :) \n", "4 positive yeaaaah yippppy!!! my accnt verified rqst has... \n", "... ... ... 
\n", "9995 negative I wanna change my avi but uSanele :( \n", "9996 negative MY PUPPY BROKE HER FOOT :( \n", "9997 negative where's all the jaebum baby pictures :(( \n", "9998 negative But but Mr Ahmad Maslan cooks too :( https://t... \n", "9999 negative @eawoman As a Hull supporter I am expecting a ... \n", "\n", " PTweet positive negative \\\n", "0 [followfriday, top, engag, member, commun, wee... 3737.0 69.0 \n", "1 [hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 \n", "2 [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 \n", "3 [congrat, :)] 3562.0 4.0 \n", "4 [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 3878.0 273.0 \n", "... ... ... ... \n", "9995 [wanna, chang, avi, usanel, :(] 55.0 4546.0 \n", "9996 [puppi, broke, foot, :(] 3.0 4439.0 \n", "9997 [where', jaebum, babi, pictur, :(] 34.0 4490.0 \n", "9998 [mr, ahmad, maslan, cook, :(] 9.0 4434.0 \n", "9999 [hull, support, expect, misser, week, :-(] 116.0 565.0 \n", "\n", " bias \n", "0 1.0 \n", "1 1.0 \n", "2 1.0 \n", "3 1.0 \n", "4 1.0 \n", "... ... 
\n", "9995 1.0 \n", "9996 1.0 \n", "9997 1.0 \n", "9998 1.0 \n", "9999 1.0 \n", "\n", "[10000 rows x 6 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 26, "id": "0acfab4f-8b9b-4f1b-8a42-477e406a5c6c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " </tr>\n", " <tr>\n", " <th>word</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>children</th>\n", " <td>3.0</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>latin</th>\n", " <td>3.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>bilal</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>leno</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>savag</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>smoak</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>siguro</th>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>kapan</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>fever</th>\n", " <td>2.0</td>\n", " <td>7.0</td>\n", " </tr>\n", " <tr>\n", " <th>3a2ad</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>10507 rows × 2 columns</p>\n", "</div>" ], "text/plain": [ " positive negative\n", "word \n", "children 3.0 2.0\n", "latin 3.0 0.0\n", "bilal 0.0 1.0\n", 
"leno 0.0 1.0\n", "savag 1.0 0.0\n", "... ... ...\n", "smoak 1.0 0.0\n", "siguro 1.0 0.0\n", "kapan 0.0 1.0\n", "fever 2.0 7.0\n", "3a2ad 0.0 1.0\n", "\n", "[10507 rows x 2 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_freq" ] }, { "cell_type": "markdown", "id": "f4388cbb-1147-4af5-9cbc-0755fb94e386", "metadata": {}, "source": [ "### Visualizing Words" ] }, { "cell_type": "code", "execution_count": 27, "id": "9b318d53-0918-4395-bf62-19a11c210902", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['happi',\n", " 'merri',\n", " 'nice',\n", " 'good',\n", " 'bad',\n", " 'sad',\n", " 'mad',\n", " 'best',\n", " 'pretti',\n", " '❤',\n", " ':)',\n", " ':(',\n", " '😒',\n", " '😬',\n", " '😄',\n", " '😍',\n", " '♛',\n", " 'song',\n", " 'idea',\n", " 'power',\n", " 'play',\n", " 'magnific']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keys = ['happi', 'merri', 'nice', 'good', 'bad', 'sad', 'mad', 'best', 'pretti',\n", " '❤', ':)', ':(', '😒', '😬', '😄', '😍', '♛',\n", " 'song', 'idea', 'power', 'play', 'magnific']\n", "\n", "sel_keys = [ k for k in keys if k in df_freq.index]\n", "sel_keys" ] }, { "cell_type": "code", "execution_count": 28, "id": "86a553d1-6c31-4e2a-81da-6f615c1861e4", "metadata": {}, "outputs": [], "source": [ "sel_df = df_freq.loc[sel_keys]" ] }, { "cell_type": "code", "execution_count": null, "id": "20173321-6e5b-4c9c-a75a-7b85fafbe9a0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 29, "id": "7ab4223d-059b-42da-8e0a-12ab86cc92ab", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. 
Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.\n" ] }, { "data": { "text/plain": [ "[<matplotlib.lines.Line2D at 0x147a3b2b1670>]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "fig, ax = plt.subplots()\n", "\n", "sel_df.plot.scatter(x='positive', y='negative', loglog=True, ax=ax)\n", "for row in sel_df.iterrows():\n", " ax.annotate(row[0], (row[1]['positive'], row[1]['negative'])) \n", " \n", "ax.plot([0, 9000], [0, 9000], color = 'red')\n", "# fig.canvas.draw()" ] }, { "cell_type": "markdown", "id": "e0958bf2-a8f0-451b-a4e2-106e959371b3", "metadata": {}, "source": [ "## Modeling - Logistic Regression" ] }, { "cell_type": "code", "execution_count": null, "id": "ed6786ee-4881-4a94-8634-6b6d735a0fe2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>class</th>\n", " <th>Tweet</th>\n", " <th>PTweet</th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " <th>bias</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>positive</td>\n", " <td>#FollowFriday @France_Inte @PKuchly57 @Milipol...</td>\n", " <td>[followfriday, top, engag, member, commun, wee...</td>\n", " <td>3737.0</td>\n", " <td>69.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>positive</td>\n", " <td>@Lamb2ja Hey James! 
How odd :/ Please call our...</td>\n", " <td>[hey, jame, odd, :/, pleas, call, contact, cen...</td>\n", " <td>4448.0</td>\n", " <td>473.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>positive</td>\n", " <td>@DespiteOfficial we had a listen last night :)...</td>\n", " <td>[listen, last, night, :), bleed, amaz, track, ...</td>\n", " <td>3728.0</td>\n", " <td>159.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>positive</td>\n", " <td>@97sides CONGRATS :)</td>\n", " <td>[congrat, :)]</td>\n", " <td>3562.0</td>\n", " <td>4.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>positive</td>\n", " <td>yeaaaah yippppy!!! my accnt verified rqst has...</td>\n", " <td>[yeaaah, yipppi, accnt, verifi, rqst, succeed,...</td>\n", " <td>3878.0</td>\n", " <td>273.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " class Tweet PTweet positive negative bias\n", "0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... [followfriday, top, engag, member, commun, wee... 3737.0 69.0 1.0\n", "1 positive @Lamb2ja Hey James! How odd :/ Please call our... [hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 1.0\n", "2 positive @DespiteOfficial we had a listen last night :)... [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 1.0\n", "3 positive @97sides CONGRATS :) [congrat, :)] 3562.0 4.0 1.0\n", "4 positive yeaaaah yippppy!!! my accnt verified rqst has... [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 
3878.0 273.0 1.0" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "83673fe1-afce-4c4b-9753-14d5aa0643d1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>class</th>\n", " <th>Tweet</th>\n", " <th>PTweet</th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " <th>bias</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>5000</th>\n", " <td>negative</td>\n", " <td>hopeless for tmr :(</td>\n", " <td>[hopeless, tmr, :(]</td>\n", " <td>2.0</td>\n", " <td>4427.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>5001</th>\n", " <td>negative</td>\n", " <td>Everything in the kids section of IKEA is so c...</td>\n", " <td>[everyth, kid, section, ikea, cute, shame, i'm...</td>\n", " <td>316.0</td>\n", " <td>4917.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>5002</th>\n", " <td>negative</td>\n", " <td>@Hegelbon That heart sliding into the waste ba...</td>\n", " <td>[heart, slide, wast, basket, :(]</td>\n", " <td>20.0</td>\n", " <td>4456.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>5003</th>\n", " <td>negative</td>\n", " <td>“@ketchBurning: I hate Japanese call him \"bani...</td>\n", " <td>[“, hate, japanes, call, bani, :(, :(, ”]</td>\n", " <td>67.0</td>\n", " <td>8962.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>5004</th>\n", " <td>negative</td>\n", " <td>Dang starting next week I have \"work\" :(</td>\n", " <td>[dang, start, next, week, work, :(]</td>\n", " <td>303.0</td>\n", " <td>4690.0</td>\n", " 
<td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>8995</th>\n", " <td>negative</td>\n", " <td>Amelia didnt stalk my twitter :(</td>\n", " <td>[amelia, didnt, stalk, twitter, :(]</td>\n", " <td>34.0</td>\n", " <td>4479.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>8996</th>\n", " <td>negative</td>\n", " <td>oh, i missed the broadcast. : (</td>\n", " <td>[oh, miss, broadcast]</td>\n", " <td>79.0</td>\n", " <td>393.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>8997</th>\n", " <td>negative</td>\n", " <td>i really can't stream on melon i feel useless :-(</td>\n", " <td>[realli, can't, stream, melon, feel, useless, ...</td>\n", " <td>174.0</td>\n", " <td>958.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>8998</th>\n", " <td>negative</td>\n", " <td>I need to stop looking at old soccer pictures :(</td>\n", " <td>[need, stop, look, old, soccer, pictur, :(]</td>\n", " <td>251.0</td>\n", " <td>4703.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>8999</th>\n", " <td>negative</td>\n", " <td>Got an interview for the job that I want but t...</td>\n", " <td>[got, interview, job, want, rang, tuesday, int...</td>\n", " <td>236.0</td>\n", " <td>4800.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>4000 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " class Tweet PTweet positive negative bias\n", "5000 negative hopeless for tmr :( [hopeless, tmr, :(] 2.0 4427.0 1.0\n", "5001 negative Everything in the kids section of IKEA is so c... [everyth, kid, section, ikea, cute, shame, i'm... 316.0 4917.0 1.0\n", "5002 negative @Hegelbon That heart sliding into the waste ba... [heart, slide, wast, basket, :(] 20.0 4456.0 1.0\n", "5003 negative “@ketchBurning: I hate Japanese call him \"bani... 
[“, hate, japanes, call, bani, :(, :(, ”] 67.0 8962.0 1.0\n", "5004 negative Dang starting next week I have \"work\" :( [dang, start, next, week, work, :(] 303.0 4690.0 1.0\n", "... ... ... ... ... ... ...\n", "8995 negative Amelia didnt stalk my twitter :( [amelia, didnt, stalk, twitter, :(] 34.0 4479.0 1.0\n", "8996 negative oh, i missed the broadcast. : ( [oh, miss, broadcast] 79.0 393.0 1.0\n", "8997 negative i really can't stream on melon i feel useless :-( [realli, can't, stream, melon, feel, useless, ... 174.0 958.0 1.0\n", "8998 negative I need to stop looking at old soccer pictures :( [need, stop, look, old, soccer, pictur, :(] 251.0 4703.0 1.0\n", "8999 negative Got an interview for the job that I want but t... [got, interview, job, want, rang, tuesday, int... 236.0 4800.0 1.0\n", "\n", "[4000 rows x 6 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:4000]\n", "df.iloc[5000:9000]" ] }, { "cell_type": "code", "execution_count": null, "id": "697dc070-9155-4413-8f11-b39b6c92477d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>class</th>\n", " <th>Tweet</th>\n", " <th>PTweet</th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " <th>bias</th>\n", " <th>sentiment</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>positive</td>\n", " <td>#FollowFriday @France_Inte @PKuchly57 @Milipol...</td>\n", " <td>[followfriday, top, engag, member, commun, wee...</td>\n", " <td>3737.0</td>\n", " <td>69.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " 
<th>1</th>\n", " <td>positive</td>\n", " <td>@Lamb2ja Hey James! How odd :/ Please call our...</td>\n", " <td>[hey, jame, odd, :/, pleas, call, contact, cen...</td>\n", " <td>4448.0</td>\n", " <td>473.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>positive</td>\n", " <td>@DespiteOfficial we had a listen last night :)...</td>\n", " <td>[listen, last, night, :), bleed, amaz, track, ...</td>\n", " <td>3728.0</td>\n", " <td>159.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>positive</td>\n", " <td>@97sides CONGRATS :)</td>\n", " <td>[congrat, :)]</td>\n", " <td>3562.0</td>\n", " <td>4.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>positive</td>\n", " <td>yeaaaah yippppy!!! my accnt verified rqst has...</td>\n", " <td>[yeaaah, yipppi, accnt, verifi, rqst, succeed,...</td>\n", " <td>3878.0</td>\n", " <td>273.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " class Tweet PTweet positive negative bias sentiment\n", "0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... [followfriday, top, engag, member, commun, wee... 3737.0 69.0 1.0 0\n", "1 positive @Lamb2ja Hey James! How odd :/ Please call our... [hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 1.0 0\n", "2 positive @DespiteOfficial we had a listen last night :)... [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 1.0 0\n", "3 positive @97sides CONGRATS :) [congrat, :)] 3562.0 4.0 1.0 0\n", "4 positive yeaaaah yippppy!!! my accnt verified rqst has... [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 
3878.0 273.0 1.0 0" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a803ef1f-1213-440d-a946-81aafb664035", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>class</th>\n", " <th>Tweet</th>\n", " <th>PTweet</th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " <th>bias</th>\n", " <th>sentiment</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>positive</td>\n", " <td>#FollowFriday @France_Inte @PKuchly57 @Milipol...</td>\n", " <td>[followfriday, top, engag, member, commun, wee...</td>\n", " <td>3737.0</td>\n", " <td>69.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>positive</td>\n", " <td>@Lamb2ja Hey James! How odd :/ Please call our...</td>\n", " <td>[hey, jame, odd, :/, pleas, call, contact, cen...</td>\n", " <td>4448.0</td>\n", " <td>473.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>positive</td>\n", " <td>@DespiteOfficial we had a listen last night :)...</td>\n", " <td>[listen, last, night, :), bleed, amaz, track, ...</td>\n", " <td>3728.0</td>\n", " <td>159.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>positive</td>\n", " <td>@97sides CONGRATS :)</td>\n", " <td>[congrat, :)]</td>\n", " <td>3562.0</td>\n", " <td>4.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>positive</td>\n", " <td>yeaaaah yippppy!!! 
my accnt verified rqst has...</td>\n", " <td>[yeaaah, yipppi, accnt, verifi, rqst, succeed,...</td>\n", " <td>3878.0</td>\n", " <td>273.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>9995</th>\n", " <td>negative</td>\n", " <td>I wanna change my avi but uSanele :(</td>\n", " <td>[wanna, chang, avi, usanel, :(]</td>\n", " <td>55.0</td>\n", " <td>4546.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>9996</th>\n", " <td>negative</td>\n", " <td>MY PUPPY BROKE HER FOOT :(</td>\n", " <td>[puppi, broke, foot, :(]</td>\n", " <td>3.0</td>\n", " <td>4439.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>9997</th>\n", " <td>negative</td>\n", " <td>where's all the jaebum baby pictures :((</td>\n", " <td>[where', jaebum, babi, pictur, :(]</td>\n", " <td>34.0</td>\n", " <td>4490.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>9998</th>\n", " <td>negative</td>\n", " <td>But but Mr Ahmad Maslan cooks too :( https://t...</td>\n", " <td>[mr, ahmad, maslan, cook, :(]</td>\n", " <td>9.0</td>\n", " <td>4434.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>9999</th>\n", " <td>negative</td>\n", " <td>@eawoman As a Hull supporter I am expecting a ...</td>\n", " <td>[hull, support, expect, misser, week, :-(]</td>\n", " <td>116.0</td>\n", " <td>565.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>10000 rows × 7 columns</p>\n", "</div>" ], "text/plain": [ " class Tweet PTweet positive negative bias sentiment\n", "0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... [followfriday, top, engag, member, commun, wee... 3737.0 69.0 1.0 1\n", "1 positive @Lamb2ja Hey James! How odd :/ Please call our... 
[hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 1.0 1\n", "2 positive @DespiteOfficial we had a listen last night :)... [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 1.0 1\n", "3 positive @97sides CONGRATS :) [congrat, :)] 3562.0 4.0 1.0 1\n", "4 positive yeaaaah yippppy!!! my accnt verified rqst has... [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 3878.0 273.0 1.0 1\n", "... ... ... ... ... ... ... ...\n", "9995 negative I wanna change my avi but uSanele :( [wanna, chang, avi, usanel, :(] 55.0 4546.0 1.0 0\n", "9996 negative MY PUPPY BROKE HER FOOT :( [puppi, broke, foot, :(] 3.0 4439.0 1.0 0\n", "9997 negative where's all the jaebum baby pictures :(( [where', jaebum, babi, pictur, :(] 34.0 4490.0 1.0 0\n", "9998 negative But but Mr Ahmad Maslan cooks too :( https://t... [mr, ahmad, maslan, cook, :(] 9.0 4434.0 1.0 0\n", "9999 negative @eawoman As a Hull supporter I am expecting a ... [hull, support, expect, misser, week, :-(] 116.0 565.0 1.0 0\n", "\n", "[10000 rows x 7 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['sentiment'] = 0\n", "df.loc[df['class']=='positive', 'sentiment']=1\n", "df" ] }, { "cell_type": "code", "execution_count": null, "id": "baa549dc-39a1-4921-bb10-911174662a62", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>class</th>\n", " <th>Tweet</th>\n", " <th>PTweet</th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " <th>bias</th>\n", " <th>sentiment</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>positive</td>\n", " 
<td>#FollowFriday @France_Inte @PKuchly57 @Milipol...</td>\n", " <td>[followfriday, top, engag, member, commun, wee...</td>\n", " <td>3737.0</td>\n", " <td>69.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>positive</td>\n", " <td>@Lamb2ja Hey James! How odd :/ Please call our...</td>\n", " <td>[hey, jame, odd, :/, pleas, call, contact, cen...</td>\n", " <td>4448.0</td>\n", " <td>473.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>positive</td>\n", " <td>@DespiteOfficial we had a listen last night :)...</td>\n", " <td>[listen, last, night, :), bleed, amaz, track, ...</td>\n", " <td>3728.0</td>\n", " <td>159.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>positive</td>\n", " <td>@97sides CONGRATS :)</td>\n", " <td>[congrat, :)]</td>\n", " <td>3562.0</td>\n", " <td>4.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>positive</td>\n", " <td>yeaaaah yippppy!!! my accnt verified rqst has...</td>\n", " <td>[yeaaah, yipppi, accnt, verifi, rqst, succeed,...</td>\n", " <td>3878.0</td>\n", " <td>273.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>8995</th>\n", " <td>negative</td>\n", " <td>Amelia didnt stalk my twitter :(</td>\n", " <td>[amelia, didnt, stalk, twitter, :(]</td>\n", " <td>34.0</td>\n", " <td>4479.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>8996</th>\n", " <td>negative</td>\n", " <td>oh, i missed the broadcast. 
: (</td>\n", " <td>[oh, miss, broadcast]</td>\n", " <td>79.0</td>\n", " <td>393.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>8997</th>\n", " <td>negative</td>\n", " <td>i really can't stream on melon i feel useless :-(</td>\n", " <td>[realli, can't, stream, melon, feel, useless, ...</td>\n", " <td>174.0</td>\n", " <td>958.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>8998</th>\n", " <td>negative</td>\n", " <td>I need to stop looking at old soccer pictures :(</td>\n", " <td>[need, stop, look, old, soccer, pictur, :(]</td>\n", " <td>251.0</td>\n", " <td>4703.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>8999</th>\n", " <td>negative</td>\n", " <td>Got an interview for the job that I want but t...</td>\n", " <td>[got, interview, job, want, rang, tuesday, int...</td>\n", " <td>236.0</td>\n", " <td>4800.0</td>\n", " <td>1.0</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>8000 rows × 7 columns</p>\n", "</div>" ], "text/plain": [ " class Tweet PTweet positive negative bias sentiment\n", "0 positive #FollowFriday @France_Inte @PKuchly57 @Milipol... [followfriday, top, engag, member, commun, wee... 3737.0 69.0 1.0 1\n", "1 positive @Lamb2ja Hey James! How odd :/ Please call our... [hey, jame, odd, :/, pleas, call, contact, cen... 4448.0 473.0 1.0 1\n", "2 positive @DespiteOfficial we had a listen last night :)... [listen, last, night, :), bleed, amaz, track, ... 3728.0 159.0 1.0 1\n", "3 positive @97sides CONGRATS :) [congrat, :)] 3562.0 4.0 1.0 1\n", "4 positive yeaaaah yippppy!!! my accnt verified rqst has... [yeaaah, yipppi, accnt, verifi, rqst, succeed,... 3878.0 273.0 1.0 1\n", "... ... ... ... ... ... ... ...\n", "8995 negative Amelia didnt stalk my twitter :( [amelia, didnt, stalk, twitter, :(] 34.0 4479.0 1.0 0\n", "8996 negative oh, i missed the broadcast. 
: ( [oh, miss, broadcast] 79.0 393.0 1.0 0\n", "8997 negative i really can't stream on melon i feel useless :-( [realli, can't, stream, melon, feel, useless, ... 174.0 958.0 1.0 0\n", "8998 negative I need to stop looking at old soccer pictures :( [need, stop, look, old, soccer, pictur, :(] 251.0 4703.0 1.0 0\n", "8999 negative Got an interview for the job that I want but t... [got, interview, job, want, rang, tuesday, int... 236.0 4800.0 1.0 0\n", "\n", "[8000 rows x 7 columns]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df = pd.concat([df.iloc[:4000], df.iloc[5000:9000]]); train_df" ] }, { "cell_type": "code", "execution_count": null, "id": "cf7e4e79-3889-47cd-b94e-4a883e719bda", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>class</th>\n", " <th>Tweet</th>\n", " <th>PTweet</th>\n", " <th>positive</th>\n", " <th>negative</th>\n", " <th>bias</th>\n", " <th>sentiment</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>4995</th>\n", " <td>positive</td>\n", " <td>@chriswiggin3 Chris, that's great to hear :) D...</td>\n", " <td>[chri, that', great, hear, :), due, time, remi...</td>\n", " <td>4005.0</td>\n", " <td>337.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4996</th>\n", " <td>positive</td>\n", " <td>@RachelLiskeard Thanks for the shout-out :) It...</td>\n", " <td>[thank, shout-out, :), great, aboard]</td>\n", " <td>4349.0</td>\n", " <td>129.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4997</th>\n", " <td>positive</td>\n", " <td>@side556 
Hey! :) Long time no talk...</td>\n", " <td>[hey, :), long, time, talk, ...]</td>\n", " <td>4075.0</td>\n", " <td>556.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4998</th>\n", " <td>positive</td>\n", " <td>@staybubbly69 as Matt would say. WELCOME TO AD...</td>\n", " <td>[matt, would, say, welcom, adulthood, ..., :)]</td>\n", " <td>4017.0</td>\n", " <td>420.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4999</th>\n", " <td>positive</td>\n", " <td>@DanielOConnel18 you could say he will have eg...</td>\n", " <td>[could, say, egg, face, :-)]</td>\n", " <td>776.0</td>\n", " <td>154.0</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " class Tweet PTweet positive negative bias sentiment\n", "4995 positive @chriswiggin3 Chris, that's great to hear :) D... [chri, that', great, hear, :), due, time, remi... 4005.0 337.0 1.0 1\n", "4996 positive @RachelLiskeard Thanks for the shout-out :) It... [thank, shout-out, :), great, aboard] 4349.0 129.0 1.0 1\n", "4997 positive @side556 Hey! :) Long time no talk... [hey, :), long, time, talk, ...] 4075.0 556.0 1.0 1\n", "4998 positive @staybubbly69 as Matt would say. WELCOME TO AD... [matt, would, say, welcom, adulthood, ..., :)] 4017.0 420.0 1.0 1\n", "4999 positive @DanielOConnel18 you could say he will have eg... 
[could, say, egg, face, :-)] 776.0 154.0 1.0 1" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Hold out every row that is not part of the training split. A label-based drop\n", "# keeps the original row order, unlike iterating over a set difference of labels.\n", "test_df = df.drop(index=train_df.index)\n", "test_df.tail()" ] }, { "cell_type": "markdown", "id": "832a09c8-82d7-40be-b20b-e30449eab141", "metadata": {}, "source": [ "### Train test Split" ] }, { "cell_type": "code", "execution_count": null, "id": "a2a1fff4-9e77-4a7b-9b6a-9e5c32306462", "metadata": {}, "outputs": [], "source": [ "X = train_df[['bias', 'positive', 'negative']]\n", "y = train_df['sentiment']" ] }, { "cell_type": "code", "execution_count": null, "id": "210b94f2-69b0-4f2f-bd73-7a346a0ab559", "metadata": {}, "outputs": [], "source": [ "X_test = test_df[['bias', 'positive', 'negative']]\n", "y_test = test_df['sentiment']" ] }, { "cell_type": "markdown", "id": "d789d4b6-3b8c-4492-b88f-45a559d89aef", "metadata": {}, "source": [ "### Model training" ] }, { "cell_type": "code", "execution_count": null, "id": "4881ff7b-b52d-45ce-9721-25572c4655da", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LogisticRegression()" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = LogisticRegression()\n", "model.fit(X , y)" ] }, { "cell_type": "markdown", "id": "8d6b78a3-5f9c-4ec9-8b87-c92e63eef0fa", "metadata": {}, "source": [ "### Model Scoring" ] }, { "cell_type": "code", "execution_count": null, "id": "f0551706-ebff-47dd-aa14-deb48060a211", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.24050227, 0.00685412, -0.0077651 ]])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.coef_" ] }, { "cell_type": "code", "execution_count": null, "id": "d33e693b-115a-4bd2-9b7e-1a4afb400d1d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.992875" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"metrics.accuracy_score(y, model.predict(X)) # Score on training " ] }, { "cell_type": "code", "execution_count": null, "id": "0aefe999-6613-4e2d-a787-5a632d1a60be", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.994" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metrics.accuracy_score(y_test, model.predict(X_test))" ] }, { "cell_type": "markdown", "id": "42ac0dff-759b-4af6-868e-94d3e638c6fa", "metadata": {}, "source": [ "### Visualizing Model" ] }, { "cell_type": "code", "execution_count": null, "id": "e1424e46-3587-4118-b222-d3c5126026fd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.00685412, 0.01370825, 0.02056237])" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# LogisticRegression() fits its own intercept in addition to the weight on the\n", "# 'bias' column of X, so the full constant term of the decision function is\n", "# coef_[0, 0] + intercept_[0]. Fold it into thetas[0]; otherwise the boundary\n", "# computed from thetas is offset from the one the model actually uses.\n", "# .copy() keeps the in-place update from writing through to model.coef_.\n", "thetas = model.coef_.reshape(3, 1).copy()\n", "thetas[0, 0] += model.intercept_[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "a093246e-2e38-494d-98d6-a09056343d04", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "28424.0" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def negative(thetas, pos):\n", "    \"\"\"Return the 'negative' coordinate of the decision boundary at `pos`.\n", "\n", "    Solves thetas[0] + thetas[1] * pos + thetas[2] * neg == 0 for neg.\n", "    `pos` may be a scalar or a NumPy array; the result broadcasts accordingly.\n", "    \"\"\"\n", "    return (-thetas[0] - thetas[1] * pos) / thetas[2]\n", "\n", "\n", "def direction(thetas, pos):\n", "    \"\"\"Return pos * thetas[2] / thetas[1].\n", "\n", "    The boundary from `negative` has slope -thetas[1] / thetas[2], so this is the\n", "    rise, over a run of `pos`, of a line perpendicular to that boundary.\n", "    \"\"\"\n", "    return pos * thetas[2] / thetas[1]" ] }, { "cell_type": "code", "execution_count": null, "id": "ce354a76-145b-44d6-888a-f028ed16b2a3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/anaconda/envs/aiking/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. 
From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "<AxesSubplot:xlabel='positive', ylabel='negative'>" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "palette = {0: 'red', 1: 'green'}\n", "offset = 5000  # only referenced by the commented-out arrow calls below\n", "ax = sns.scatterplot(data=df, x='positive', y='negative', hue='sentiment', palette=palette, marker='.')\n", "\n", "# Sample the boundary at integer steps up to the largest absolute value in X.\n", "pos = np.arange(0, int(X.abs().max().max()), 1)\n", "\n", "# x and y must be passed by keyword: seaborn warns on positional use and\n", "# treats it as an error from version 0.12 onwards.\n", "sns.lineplot(x=pos, y=negative(thetas, pos), ax=ax)\n", "# ax.arrow(offset, negative(thetas, offset), offset, direction(thetas, offset), head_width=500, head_length=500, fc='g', ec='g')\n", "# # Plot a red line pointing to the negative direction\n", "# ax.arrow(offset, negative(thetas, offset), -offset, -direction(thetas, offset), head_width=500, head_length=500, fc='r', ec='r')" ] }, { "cell_type": "code", "execution_count": null, "id": "ebe1201c-3a0b-4f61-9eba-67b16c6b8dc1", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "6230a779-a976-4fe2-b9c5-3295b26a8d27", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "bf5112f1-0c81-445e-bb79-1f62d55c21ca", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 }