
Sentiment Analysis - [Amazon Sales Data]

Sentiment analysis is a natural language processing (NLP) technique for identifying the emotion expressed in text. In this post, we split the reviews into positive and negative classes based on their ratings, then train several classifiers and compare their performance.

 

1. Importing libraries 

import pandas as pd
import numpy as np
from cleantext import clean
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import collections
import matplotlib.pyplot as plt
import seaborn as sns

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# XGBoost (imported but not used below)
import xgboost as xgb

# Support Vector Machine (SVM)
from sklearn.svm import SVC

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

#Neural Network
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split

 

2. Reading the data

df = pd.read_csv("amazon_sales.csv")
df.head()
df.shape
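
The rest of the walkthrough only touches two fields, 'rating' and 'review_content' (an assumption about this file's schema); a quick check that they are present:

df[['rating', 'review_content']].head()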

 

3. Labeling

Reviews whose 'rating' is 4.1 or higher are labeled 1 (positive) and the rest 0 (negative); the result is written to a new column named 'label'.

df['label'] = df['rating'].apply(lambda x: 1 if float(x) >= 4.1 else 0)
df['label'].value_counts()
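
If 'rating' arrives as text and contains the occasional non-numeric entry (an assumption about this CSV; float(x) would raise a ValueError on such a row), a more defensive variant coerces bad values to NaN first:

ratings = pd.to_numeric(df['rating'], errors='coerce')  # non-numeric entries become NaN
df['label'] = (ratings >= 4.1).astype(int)              # NaN compares as False, so it becomes 0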

 

Visualizing the label distribution
labels = ['positive', 'negative']
values = df['label'].value_counts()

# Index by label value so the bars match the names regardless of value_counts order
plt.bar(labels, [values[1], values[0]])

plt.xlabel('Label')
plt.ylabel('Count')
plt.title('Label Counts')

plt.show()

 

Sentiment Distribution

4. Data Pre-processing

Cleaning the data with the cleantext package
df['review_cleaned'] = df['review_content'].apply(lambda x: clean(x, 
                                                            fix_unicode=True, 
                                                            to_ascii=True, 
                                                            lower=True, 
                                                            no_urls=True, 
                                                            no_emails=True,
                                                            no_phone_numbers=True, 
                                                            no_numbers=True,
                                                            no_digits=True, 
                                                            no_currency_symbols=True,
                                                            no_punct=True, 
                                                            lang="en"))
import re

def clean_extra(text):
    text = re.sub(r'<.*?>', '', text)  # strip leftover HTML tags
    text = re.sub(r'\+', ' ', text)    # replace '+' signs with spaces
    return text

Removing emojis and stopwords
def remove_emojis(text):
    emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # Emoticons
                            u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                            u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                            u"\U0001F700-\U0001F77F"  # Alphanumerics
                            u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                            u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            u"\U00002702-\U000027B0"  # Dingbats
                            u"\U000024C2-\U0001F251" 
                            "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r"", text)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_nltk = set(stopwords.words('english'))

def remove_stopwords(text):   
    text_data_without_stopwords = " ".join([word for word in text.split() if word.lower() not in stopwords_nltk])
    return text_data_without_stopwords
df['review_cleaned'] = df['review_cleaned'].apply(remove_emojis)
df['review_cleaned'] = df['review_cleaned'].apply(clean_extra)
df['review_cleaned'] = df['review_cleaned'].apply(remove_stopwords)
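
As a sanity check, running the full chain on a made-up review shows what survives each step (the sample string below is purely illustrative, not from the dataset):

sample = "LOVED it!!! Visit https://example.com or mail a@b.com for a $20 deal"
step1 = clean(sample, fix_unicode=True, to_ascii=True, lower=True,
              no_urls=True, no_emails=True, no_phone_numbers=True,
              no_numbers=True, no_digits=True, no_currency_symbols=True,
              no_punct=True, lang="en")
print(remove_stopwords(clean_extra(remove_emojis(step1))))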

 

5. Vectorization

Using CountVectorizer, looking at word frequencies.
cv = CountVectorizer()
bow = cv.fit_transform(df.review_cleaned)
word_freq = dict(zip(cv.get_feature_names_out(), np.asarray(bow.sum(axis=0)).ravel()))
word_counter = collections.Counter(word_freq)
word_counter_df = pd.DataFrame(word_counter.most_common(20), columns = ['word', 'freq'])
plt.rcParams.update({'font.size': 8})  # set the font size before drawing
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x="word", y="freq", data=word_counter_df, palette="PuBuGn_d", ax=ax)
plt.xticks(rotation=45)
plt.show()

 

Word Frequency

Using TF-IDF Vectorizer
tfidf = TfidfVectorizer()
corpus = tfidf.fit_transform(df.review_cleaned)  # sparse matrix of TF-IDF weights (reviews x terms)
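
A quick look at the result confirms its shape and which terms dominate a given review (a small inspection sketch reusing the imports from step 1):

print(corpus.shape)  # (number of reviews, vocabulary size)

terms = tfidf.get_feature_names_out()
row = corpus[0].toarray().ravel()   # TF-IDF weights of the first review
top = np.argsort(row)[::-1][:5]     # indices of the 5 largest weights
print([(terms[i], round(row[i], 3)) for i in top])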

 

6. Performance Evaluation

# Splitting the features into train and test

X_train, X_test, y_train, y_test = train_test_split(
    corpus, df['label'], test_size=0.2, random_state=2001)
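
Since positive reviews typically far outnumber negative ones here, an optional stratified split (a variant, not what the original run used) keeps the class ratio identical in both subsets:

X_train, X_test, y_train, y_test = train_test_split(
    corpus, df['label'], test_size=0.2, random_state=2001,
    stratify=df['label'])  # preserve the positive/negative ratio in both splits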
# Performance computing function

def compute_performance(name, model, X_train, y_train, X_test, y_test):
    # Importing all the metrics
    from sklearn.metrics import (
        confusion_matrix,
        accuracy_score,
        precision_score,
        recall_score,
        f1_score
    )

    # Fit on the training data, then predict on the test data
    model = model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # Calculate measures
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1score = f1_score(y_test, pred)
    return [name, accuracy, precision, recall, f1score]

 

Employing four different models and measuring their performance
names = [
    "RandomForestClassifier",
    "RBF SVM",
    "Decision Tree",
    "Neural Net"
]


classifiers = [
    RandomForestClassifier(n_estimators=500, max_depth=20, random_state=50),
    SVC(gamma=2, C=1, random_state=50),
    DecisionTreeClassifier(max_depth=5, random_state=50),
    MLPClassifier(alpha=1, max_iter=1000, random_state=50)
]


# Iterating over classifiers

# Creating a scores dataframe with column names
scores = pd.DataFrame(columns=['Name', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
for name, clf in zip(names, classifiers):
    rs = compute_performance(name, clf, X_train, y_train, X_test, y_test)
    scores.loc[len(scores)] = rs

scores.head()

 

Results
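
To compare the four models at a glance, the scores table can also be reshaped and plotted with the seaborn import from step 1 (an optional visualization sketch):

melted = scores.melt(id_vars='Name', var_name='Metric', value_name='Score')
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Metric', y='Score', hue='Name', data=melted, ax=ax)
ax.set_title('Model Performance Comparison')
plt.show()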