Question

1. Scrape 500 reviews each from Amazon for the following products:
Link 1: Molly's Suds Original Laundry Detergent Powder, Bundle of 2 (500 REVIEWS)
Link 2: OxiClean Versatile Stain Remover Powder, 7.22 lbs (500 REVIEWS)
2. Preprocess review data by removing punctuation, replacing contractions, removing stop-words, and returning lower-case text
3. Re-code rating data as either positive (4 or 5 stars) or negative (1 to 3 stars)
4. Split the data 70/30 and build four different models:
a. Naïve Bayes
b. Decision Tree
c. Logistic Regression
d. Support Vector Machine (SVM)
5. Use four different feature sets:
a. Uni-grams
b. Bi-grams
c. Tri-grams
d. Combination of all three
6. Summarize performance metrics by providing a table for all models
7. Identify the top keywords for both positive and negative reviews
8. Identify which combination of features and model produces the best negative and positive recall
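
Before turning to the preview, note that steps 3 through 8 can be prototyped end-to-end in a few lines of scikit-learn. The sketch below is illustrative only: the file name reviews.csv and the Rating/Review column names are our assumptions, not part of the assignment or the purchased solution.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

reviews = pd.read_csv('reviews.csv')  # hypothetical file of scraped reviews
reviews['label'] = reviews['Rating'].apply(lambda r: 'pos' if r >= 4 else 'neg')  # step 3

vec = CountVectorizer(ngram_range=(1, 1))  # uni-grams (5a); (1, 3) gives the combined set (5d)
X = vec.fit_transform(reviews['Review'])
X_tr, X_te, y_tr, y_te = train_test_split(X, reviews['label'],
                                          test_size=0.30, random_state=1)  # step 4

models = {'Naive Bayes': MultinomialNB(),
          'Decision Tree': DecisionTreeClassifier(),
          'Logistic Regression': LogisticRegression(max_iter=1000),
          'SVM': LinearSVC()}
for name, model in models.items():  # steps 4a-d
    model.fit(X_tr, y_tr)
    print(name)
    print(classification_report(y_te, model.predict(X_te)))  # per-class recall (steps 6, 8)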

Solution Preview

These solutions may offer step-by-step problem-solving explanations or examples of good writing that demonstrate modern formatting styles and the construction of bibliographies, in-text citations, and references. Students may use these solutions for personal skill-building and practice. Unethical use is strictly forbidden.

# (The preview opens mid-way through the scraping loop; headers, review_url, and
# the counters below are defined earlier in the notebook. The fragment relies on
# import urllib.request, import pandas as pd, from bs4 import BeautifulSoup.)
    req = urllib.request.Request(url=review_url, headers=headers)  # the original's urllib2 is Python 2 only
    html = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(html, "lxml")
    # time.sleep(5)
    reviews = soup.findAll("div", class_="celwidget")  # each review sits in a celwidget div
    max_limit += 1
    print(len(reviews))
    if len(reviews) == 0:
        pass  # no-op as written; a break to stop paging was presumably intended
    for review in reviews:
        rating = 1
        r1 = review.find("span", class_="a-icon-alt")  # e.g. "4.0 out of 5 stars"
        if r1 is not None:
            r = r1.text
            if r == '5.0 out of 5 stars':
                rating = 5
            elif r == '4.0 out of 5 stars':
                rating = 4
            elif r == '3.0 out of 5 stars':
                rating = 3
            elif r == '2.0 out of 5 stars':
                rating = 2
            elif r == '1.0 out of 5 stars':
                rating = 1

            r_text_cont = review.find("span", class_="review-text")

            if r_text_cont is not None:
                rev = r_text_cont.find("span")
                r_text = rev.text
                # df.append() was removed in pandas 2.0; concatenate a one-row frame instead
                df = pd.concat([df, pd.DataFrame([{'Rating': rating, 'Review': r_text}])],
                               ignore_index=True)
                # print(str(rating) + " : " + r_text)
                total_review += 1
    page_count += 1

data = df
# Step 3: re-code 1-3 stars as negative, 4-5 stars as positive
data['Rating'] = data['Rating'].replace([1.0, 2.0, 3.0, 4.0, 5.0],
                                        ['neg', 'neg', 'neg', 'pos', 'pos'])

print(data.shape)
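
The five-branch rating chain above can be collapsed into a single parse. A minimal sketch, where extract_rating is our own hypothetical helper rather than anything in the solution:

import re

def extract_rating(alt_text):
    """Parse 'X.0 out of 5 stars' into an int star rating; None if no match."""
    m = re.match(r'(\d+(?:\.\d+)?) out of 5 stars', alt_text)
    return int(float(m.group(1))) if m else None

print(extract_rating('4.0 out of 5 stars'))  # -> 4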

# (Imports restated here so the snippet is self-contained.)
import re
import random
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob, Word

def preprocess(x):
    x = re.sub(r'[^\w\s]', '', x)  # strip punctuation
    x = re.sub(r'\d', '', x)       # strip digits
    return x.lower()               # lower-case the text

X = data['Review']
y = data['Rating']

X = X.apply(preprocess)

nltk.download('stopwords')
stop = stopwords.words('english')

X = X.apply(lambda s: " ".join(w for w in s.split() if w not in stop))  # drop stop-words
X = X.apply(lambda s: str(TextBlob(s).correct()))  # spell-correction; slow on large corpora

st = PorterStemmer()
X = X.apply(lambda s: " ".join(st.stem(w) for w in s.split()))
X = X.apply(lambda s: " ".join(Word(w).lemmatize() for w in s.split()))

nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

def find_tuples(X, y):
    """Build NLTK-style (feature_dict, label) pairs: one boolean presence
    flag per vocabulary word, for every review."""
    train = list(zip(X, y))
    words = set(word.lower() for p in train for word in word_tokenize(p[0]))
    t = [({word: (word in word_tokenize(x[0])) for word in words}, x[1]) for x in train]
    return t

from nltk import bigrams, trigrams

# The functions below explode each review into its bi-grams/tri-grams,
# pairing every n-gram with the label of the review it came from.

def bigramReturner(X, y):
    bigramFeatureVector = []
    y_big = []
    for idx, val in enumerate(X):
        for item in bigrams(val.split()):
            bigramFeatureVector.append(' '.join(item))
            y_big.append(y[idx])
    return bigramFeatureVector, y_big

def trigramReturner(X, y):
    trigramFeatureVector = []
    y_tri = []
    for idx, val in enumerate(X):
        for item in trigrams(val.split()):
            trigramFeatureVector.append(' '.join(item))
            y_tri.append(y[idx])
    return trigramFeatureVector, y_tri

random.seed(1)

def split_label_feats(x, split=0.75):
    """Shuffle the labeled feature sets, then cut at `split` (train/test)."""
    random.shuffle(x)
    cutoff = int(len(x) * split)
    return x[:cutoff], x[cutoff:]
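
The hand-rolled extractors above rebuild what scikit-learn's CountVectorizer provides through its ngram_range parameter. A hedged alternative sketch for the combined uni/bi/tri-gram feature set (5d), assuming X and y are the preprocessed series built above:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

vec = CountVectorizer(ngram_range=(1, 3))  # uni- + bi- + tri-grams together
X_ngrams = vec.fit_transform(X)            # sparse document-term matrix
X_tr, X_te, y_tr, y_te = train_test_split(X_ngrams, y, test_size=0.30, random_state=1)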

Naive Bayes
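
NLTK's NaiveBayesClassifier expects exactly the (feature_dict, label) pairs that find_tuples produces. A toy illustration of that format (ours, not the solution's):

# Two one-line "reviews"; find_tuples flags each vocabulary word's presence.
toy = find_tuples(pd.Series(['good soap', 'bad smell']), pd.Series(['pos', 'neg']))
print(toy[0])  # ({'good': True, 'soap': True, 'bad': False, 'smell': False}, 'pos')
               # (dict key order may vary)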

# data_labels = find_tuples(X_new, y_new)
data_labels = find_tuples(X, y)

# Splitting the data into training and test sets, 70/30
X_train, X_test = split_label_feats(data_labels, split=0.70)

# Developing Naive Bayes classifier
...

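The preview cuts off at the classifier. One plausible continuation (not necessarily what the purchased notebook contains) trains NLTK's Naive Bayes on the split and surfaces the most informative words, which also addresses step 7:

import nltk

classifier = nltk.NaiveBayesClassifier.train(X_train)
print('Accuracy:', nltk.classify.accuracy(classifier, X_test))
classifier.show_most_informative_features(20)  # top pos/neg keywords (step 7)
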
By purchasing this solution you'll be able to access the following files:
Solution.ipynb.

$45.00 for this solution

