 # 1. Scrape 500 reviews each from Amazon for the following products: ...

## Question

1. Scrape 500 reviews each from Amazon for the following products:
Link 1: Molly's Suds Original Laundry Detergent Powder, Bundle of 2 (500 REVIEWS)
Link 2: OxiClean Versatile Stain Remover Powder, 7.22 lbs (500 REVIEWS)
2. Preprocess review data by removing punctuation, replacing contractions, removing stop-words, and returning lower-case text
3. Re-code rating data as either positive (4 or 5 stars) or negative (1 to 3 stars)
4. Split the data 70/30 and build four different models:
a. Naïve Bayes
b. Decision Tree
c. Logistic Regression
d. Support Vector Machine (SVM)
5. Use four different feature sets:
a. Uni-grams
b. Tri-grams
c. Bi-grams
d. Combination of all three
6. Summarize performance metrics by providing a table for all models
7. Identify the top keywords for both positive and negative reviews
8. Identify which combination of features and model produces the best negative and positive recall

## Solution Preview

"\n",
"    html = urllib2.urlopen(req).read()\n",
"    soup = BeautifulSoup(html, \"lxml\")\n",
"    #time.sleep(5)\n",
"    reviews = soup.findAll(\"div\", class_=\"celwidget\")\n",
"    max_limit += 1\n",
"    print (len(reviews))\n",
"    if len(reviews) == 0:\n",
"       pass\n",
"    for review in reviews:\n",
"       rating = 1\n",
"       r1 = review.find(\"span\", class_=\"a-icon-alt\")\n",
"       if r1 is not None:\n",
"            r = r1.text\n",
"            if r == '5.0 out of 5 stars':\n",
"                rating = 5\n",
"            elif r == '4.0 out of 5 stars':\n",
"                rating = 4\n",
"            elif r == '3.0 out of 5 stars':\n",
"                rating = 3\n",
"            elif r == '2.0 out of 5 stars':\n",
"                rating = 2\n",
"            elif r == '1.0 out of 5 stars':\n",
"                rating = 1\n",
"\n",
"            r_text_cont = review.find(\"span\",\n",
"                                     [\"review-text\"])\n",
"\n",
"            if r_text_cont is not None:\n",
"                rev = r_text_cont.find(\"span\")\n",
"                r_text = rev.text\n",
"                df = df.append({'Rating': rating, 'Review': r_text}, ignore_index=True)\n",
"                #print(str(rating) + \" : \" + r_text)\n",
"                total_review += 1\n",
"    page_count += 1\n",
"\n",
"data = df\n",
"data.Rating.replace([1.0, 2.0, 3.0, 4.0, 5.0], ['neg', 'neg', 'neg', 'pos', 'pos'], inplace=True)\n",
"\n",
"print (data.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"def preprocess(x):\n",
"    \"\"\"Return `x` lower-cased with punctuation and digits removed.\n",
"\n",
"    Characters that are neither word characters nor whitespace are deleted\n",
"    first, then digits; underscores count as word characters and are kept.\n",
"    Whitespace is left untouched, so gaps left by removed tokens remain.\n",
"    \"\"\"\n",
"    # Raw strings keep the regex escapes out of Python's own escape handling\n",
"    # (a plain '\\w' literal triggers an invalid-escape warning).\n",
"    x = re.sub(r'[^\\w\\s]', '', x)\n",
"    x = re.sub(r'\\d', '', x)\n",
"    return x.lower()\n",
"\n",
"X = data['Review']\n",
"y = data['Rating']\n",
"\n",
"# Strip punctuation/digits and lower-case every review.\n",
"X = X.apply(preprocess)\n",
"\n",
"stop = stopwords.words('english')\n",
"# Set copy gives O(1) membership tests; `stop` itself stays a list.\n",
"stop_lookup = set(stop)\n",
"\n",
"# Drop stop-words, then let TextBlob attempt spelling correction.\n",
"X = X.apply(lambda text: \" \".join(tok for tok in text.split() if tok not in stop_lookup))\n",
"X = X.apply(lambda text: str(TextBlob(text).correct()))\n",
"\n",
"# Stem with Porter first, then lemmatize the stemmed tokens.\n",
"st = PorterStemmer()\n",
"X = X.apply(lambda text: \" \".join([st.stem(word) for word in text.split()]))\n",
"X = X.apply(lambda text: \" \".join([Word(word).lemmatize() for word in text.split()]))\n",
"\n",
"from nltk.tokenize import word_tokenize\n",
"def find_tuples(X, y):\n",
"    \"\"\"Build (feature_dict, label) pairs from parallel texts and labels.\n",
"\n",
"    Each feature dict maps every word of the corpus vocabulary to a bool\n",
"    saying whether that word occurs in the given text.\n",
"\n",
"    X: iterable of text strings.\n",
"    y: iterable of labels, paired with X by position.\n",
"    Returns a list with one (dict, label) tuple per text.\n",
"    \"\"\"\n",
"    # Tokenize each text exactly once; sets make the membership tests cheap.\n",
"    tokenized = [({tok.lower() for tok in word_tokenize(text)}, label)\n",
"                 for text, label in zip(X, y)]\n",
"\n",
"    words = set()\n",
"    for tokens, _ in tokenized:\n",
"        words.update(tokens)\n",
"\n",
"    # Pair the features with the label itself, not with the (text, label) tuple.\n",
"    return [({word: (word in tokens) for word in words}, label)\n",
"            for tokens, label in tokenized]\n",
"\n",
"from nltk import bigrams, trigrams\n",
"\n",
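"# The functions below return, for every review, its bigrams / trigrams joined into\n",
"# space-separated strings, plus a parallel list repeating the review's label once per n-gram.\n",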
"\n",
"def bigramReturner(X, y):\n",
"    \"\"\"Collect every bigram of every text in X together with its label.\n",
"\n",
"    Texts are split on whitespace; each bigram is joined with a single\n",
"    space. Returns (bigram_strings, labels), two lists of equal length\n",
"    where labels[i] is y[idx] of the text that produced bigram_strings[i].\n",
"    \"\"\"\n",
"    bigram_strings = []\n",
"    labels = []\n",
"    for position, text in enumerate(X):\n",
"        joined = [' '.join(pair) for pair in bigrams(text.split())]\n",
"        bigram_strings.extend(joined)\n",
"        # y is only looked up when the text actually produced a bigram.\n",
"        labels.extend(y[position] for _ in joined)\n",
"    return bigram_strings, labels\n",
"\n",
"def trigramReturner(X, y):\n",
"    \"\"\"Collect every trigram of every text in X together with its label.\n",
"\n",
"    Texts are split on whitespace; each trigram is joined with single\n",
"    spaces. Returns (trigram_strings, labels), two lists of equal length\n",
"    where labels[i] is y[idx] of the text that produced trigram_strings[i].\n",
"    \"\"\"\n",
"    trigram_strings = []\n",
"    labels = []\n",
"    for position, text in enumerate(X):\n",
"        joined = [' '.join(triple) for triple in trigrams(text.split())]\n",
"        trigram_strings.extend(joined)\n",
"        # y is only looked up when the text actually produced a trigram.\n",
"        labels.extend(y[position] for _ in joined)\n",
"    return trigram_strings, labels\n",
"\n",
"\n",
"random.seed(1)\n",
"\n",
"def split_label_feats(x, split=0.75):\n",
"    \"\"\"Shuffle `x` in place and cut it into a leading and a trailing part.\n",
"\n",
"    split: fraction of the items that goes into the first (training) part;\n",
"    the cut position is truncated to an integer.\n",
"    Returns (first_part, second_part) as two slices of the shuffled `x`.\n",
"    \"\"\"\n",
"    # NOTE: this shuffles the caller's list itself, not a copy.\n",
"    random.shuffle(x)\n",
"    boundary = int(len(x) * split)\n",
"    return x[:boundary], x[boundary:]"
]
},
{
"cell_type": "markdown",
"source": [
"# Naive Bayes"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"#data_labels = find_tuples(X_new, y_new)\n",
"data_labels = find_tuples(X, y)\n",
"\n",
"# Splitting data into training and test set 70/30\n",
"\n",
"\n",
"X_train, X_test = split_label_feats(data_labels, split=0.70)\n",
"\n",
"\n",
"# Developing Naive Bayes classifier\n...

