\n",
"## \n",
" **Hints**\n",
"

\n",
"

"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# UNQ_C8 GRADED FUNCTION: get_ratio\n",
"\n",
"def get_ratio(freqs, word):\n",
" '''\n",
" Input:\n",
" freqs: dictionary containing the words\n",
"\n",
" Output: a dictionary with keys 'positive', 'negative', and 'ratio'.\n",
" Example: {'positive': 10, 'negative': 20, 'ratio': 0.5}\n",
" '''\n",
" pos_neg_ratio = {'positive': 0, 'negative': 0, 'ratio': 0.0}\n",
" ### START CODE HERE ###\n",
" # use lookup() to find positive counts for the word (denoted by the integer 1)\n",
" pos_neg_ratio['positive'] = freqs.get((word,1.0),0)\n",
" \n",
" # use lookup() to find negative counts for the word (denoted by integer 0)\n",
" pos_neg_ratio['negative'] = freqs.get((word,0.0),0)\n",
" \n",
" # calculate the ratio of positive to negative counts for the word\n",
" pos_neg_ratio['ratio'] = (pos_neg_ratio['positive']+1) / (pos_neg_ratio['negative']+1)\n",
" ### END CODE HERE ###\n",
" return pos_neg_ratio\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'positive': 162, 'negative': 18, 'ratio': 8.578947368421053}"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_ratio(freqs, 'happi')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[92m All tests passed\n"
]
}
],
"source": [
"# Test your function\n",
"w2_unittest.test_get_ratio(get_ratio, freqs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Implement get_words_by_threshold(freqs,label,threshold)\n",
"\n",
"* If we set the label to 1, then we'll look for all words whose threshold of positive/negative is at least as high as that threshold, or higher.\n",
"* If we set the label to 0, then we'll look for all words whose threshold of positive/negative is at most as low as the given threshold, or lower.\n",
"* Use the `get_ratio` function to get a dictionary containing the positive count, negative count, and the ratio of positive to negative counts.\n",
"* Append the `get_ratio` dictionary inside another dictinoary, where the key is the word, and the value is the dictionary `pos_neg_ratio` that is returned by the `get_ratio` function.\n",
"An example key-value pair would have this structure:\n",
"```\n",
"{'happi':\n",
" {'positive': 10, 'negative': 20, 'ratio': 0.524}\n",
"}\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# UNQ_C9 GRADED FUNCTION: get_words_by_threshold\n",
"\n",
"def get_words_by_threshold(freqs, label, threshold, get_ratio=get_ratio):\n",
" '''\n",
" Input:\n",
" freqs: dictionary of words\n",
" label: 1 for positive, 0 for negative\n",
" threshold: ratio that will be used as the cutoff for including a word in the returned dictionary\n",
" Output:\n",
" word_list: dictionary containing the word and information on its positive count, negative count, and ratio of positive to negative counts.\n",
" example of a key value pair:\n",
" {'happi':\n",
" {'positive': 10, 'negative': 20, 'ratio': 0.5}\n",
" }\n",
" '''\n",
" word_list = {}\n",
"\n",
" ### START CODE HERE ###\n",
" for key in freqs.keys():\n",
" word, _ = key\n",
"\n",
" # get the positive/negative ratio for a word\n",
" pos_neg_ratio = get_ratio(freqs, word)\n",
"\n",
" # if the label is 1 and the ratio is greater than or equal to the threshold...\n",
" if label == 1 and pos_neg_ratio['ratio'] >= threshold:\n",
" \n",
" # Add the pos_neg_ratio to the dictionary\n",
" word_list[word] = pos_neg_ratio\n",
"\n",
" # If the label is 0 and the pos_neg_ratio is less than or equal to the threshold...\n",
" elif label == 0 and pos_neg_ratio['ratio'] <= threshold:\n",
" \n",
" # Add the pos_neg_ratio to the dictionary\n",
" word_list[word] = pos_neg_ratio\n",
"\n",
" # otherwise, do not include this word in the list (do nothing)\n",
"\n",
" ### END CODE HERE ###\n",
" return word_list\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{':(': {'positive': 1, 'negative': 3675, 'ratio': 0.000544069640914037},\n",
" ':-(': {'positive': 0, 'negative': 386, 'ratio': 0.002583979328165375},\n",
" 'zayniscomingbackonjuli': {'positive': 0, 'negative': 19, 'ratio': 0.05},\n",
" '26': {'positive': 0, 'negative': 20, 'ratio': 0.047619047619047616},\n",
" '>:(': {'positive': 0, 'negative': 43, 'ratio': 0.022727272727272728},\n",
" 'lost': {'positive': 0, 'negative': 19, 'ratio': 0.05},\n",
" '♛': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},\n",
" '》': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},\n",
" 'beli̇ev': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},\n",
" 'wi̇ll': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},\n",
" 'justi̇n': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},\n",
" 'ｓｅｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},\n",
" 'ｍｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}}"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test your function: find negative words at or below a threshold\n",
"get_words_by_threshold(freqs, label=0, threshold=0.05)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'followfriday': {'positive': 23, 'negative': 0, 'ratio': 24.0},\n",
" 'commun': {'positive': 27, 'negative': 1, 'ratio': 14.0},\n",
" ':)': {'positive': 2960, 'negative': 2, 'ratio': 987.0},\n",
" 'flipkartfashionfriday': {'positive': 16, 'negative': 0, 'ratio': 17.0},\n",
" ':D': {'positive': 523, 'negative': 0, 'ratio': 524.0},\n",
" ':p': {'positive': 104, 'negative': 0, 'ratio': 105.0},\n",
" 'influenc': {'positive': 16, 'negative': 0, 'ratio': 17.0},\n",
" ':-)': {'positive': 552, 'negative': 0, 'ratio': 553.0},\n",
" \"here'\": {'positive': 20, 'negative': 0, 'ratio': 21.0},\n",
" 'youth': {'positive': 14, 'negative': 0, 'ratio': 15.0},\n",
" 'bam': {'positive': 44, 'negative': 0, 'ratio': 45.0},\n",
" 'warsaw': {'positive': 44, 'negative': 0, 'ratio': 45.0},\n",
" 'shout': {'positive': 11, 'negative': 0, 'ratio': 12.0},\n",
" ';)': {'positive': 22, 'negative': 0, 'ratio': 23.0},\n",
" 'stat': {'positive': 51, 'negative': 0, 'ratio': 52.0},\n",
" 'arriv': {'positive': 57, 'negative': 4, 'ratio': 11.6},\n",
" 'glad': {'positive': 41, 'negative': 2, 'ratio': 14.0},\n",
" 'blog': {'positive': 27, 'negative': 0, 'ratio': 28.0},\n",
" 'fav': {'positive': 11, 'negative': 0, 'ratio': 12.0},\n",
" 'fantast': {'positive': 9, 'negative': 0, 'ratio': 10.0},\n",
" 'fback': {'positive': 26, 'negative': 0, 'ratio': 27.0},\n",
" 'pleasur': {'positive': 10, 'negative': 0, 'ratio': 11.0},\n",
" '←': {'positive': 9, 'negative': 0, 'ratio': 10.0},\n",
" 'aqui': {'positive': 9, 'negative': 0, 'ratio': 10.0}}"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test your function; find positive words at or above a threshold\n",
"get_words_by_threshold(freqs, label=1, threshold=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice the difference between the positive and negative ratios. Emojis like :( and words like 'me' tend to have a negative connotation. Other words like glad, community, arrives, tend to be found in the positive tweets."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[92m All tests passed\n"
]
}
],
"source": [
"# Test your function\n",
"w2_unittest.test_get_words_by_threshold(get_words_by_threshold, freqs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Part 5: Error Analysis\n",
"\n",
"In this part you will see some tweets that your model missclassified. Why do you think the missclassifications happened? Were there any assumptions made by your naive bayes model?"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Truth Predicted Tweet\n",
"1\t0.00\tb'truli later move know queen bee upward bound movingonup'\n",
"@jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp\n",
"1\t0.00\tb'new report talk burn calori cold work harder warm feel better weather :p'\n",
"A new report talks about how we burn more calories in the cold, because we work harder to warm up. Feel any better about the weather? :p\n",
"1\t0.00\tb'harri niall 94 harri born ik stupid wanna chang :D'\n",
"Harry and niall and -94 (when harry was born) ik it's stupid and i wanna change it :D https://t.co/gHAt8ZDAfF\n",
"1\t0.00\tb'park get sunlight'\n",
"off to the park to get some sunlight : )\n",
"1\t0.00\tb'uff itna miss karhi thi ap :p'\n",
"@msarosh Uff Itna Miss karhy thy ap :p\n",
"0\t1.00\tb'hello info possibl interest jonatha close join beti :( great'\n",
"@rcdlccom hello, any info about possible interest in Jonathas ?? He is close to join Betis :( greatings\n",
"0\t1.00\tb'u prob fun david'\n",
"@phenomyoutube u probs had more fun with david than me : (\n",
"0\t1.00\tb'pat jay'\n",
"pats jay : (\n",
"0\t1.00\tb'sr financi analyst expedia inc bellevu wa financ expediajob job job hire'\n",
"Sr. Financial Analyst - Expedia, Inc.: (#Bellevue, WA) http://t.co/ktknMhvwCI #Finance #ExpediaJobs #Job #Jobs #Hiring\n"
]
}
],
"source": [
"# Some error analysis done for you\n",
"print('Truth Predicted Tweet')\n",
"for x, y in zip(test_x, test_y):\n",
" y_hat = naive_bayes_predict(x, logprior, loglikelihood)\n",
" if y != (np.sign(y_hat) > 0):\n",
" print('%d\\t%0.2f\\t%s\\n%s' % (y, np.sign(y_hat) > 0, ' '.join(\n",
" process_tweet(x)).encode('ascii', 'ignore'),x))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Part 6: Predict with your own tweet\n",
"\n",
"In this part you can predict the sentiment of your own tweet."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9.568926233771123\n"
]
}
],
"source": [
"# Test with your own tweet - feel free to modify `my_tweet`\n",
"my_tweet = 'I am happy because I am learning :)'\n",
"\n",
"p = naive_bayes_predict(my_tweet, logprior, loglikelihood)\n",
"print(p)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Congratulations on completing this assignment. See you next week!"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

\n", "

- \n",
"
- Please use the `process_tweet` function that was imported above, and then store the words in their respective dictionaries and sets. \n", "
- You may find it useful to use the `zip` function to match each element in `tweets` with each element in `ys`. \n", "
- Remember to check if the key in the dictionary exists before adding that key to the dictionary, or incrementing its value. \n", "
- Assume that the `result` dictionary that is input will contain clean key-value pairs (you can assume that the values will be integers that can be incremented). It is good practice to check the datatype before incrementing the value, but it's not required here. \n", "

\n",
" Words\n",
" | \n", " Positive word count\n", " | \n", " Negative Word Count\n", " |

\n", " glad\n", " | \n", " 41\n", " | \n", " 2\n", " |

\n", " arriv\n", " | \n", " 57\n", " | \n", " 4\n", " |

\n", " :(\n", " | \n", " 1\n", " | \n", " 3663\n", " |

\n", " :-(\n", " | \n", " 0\n", " | \n", " 378\n", " |