In [1]:

!pip install pandas nltk scikit-learn matplotlib wordcloud

Requirement already satisfied: pandas in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (2.2.2)
Requirement already satisfied: nltk in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (3.8.1)
Requirement already satisfied: scikit-learn in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (1.5.0)
Requirement already satisfied: matplotlib in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (3.9.0)
Requirement already satisfied: wordcloud in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (1.9.3)
Requirement already satisfied: numpy>=1.26.0 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\alixp\appdata\roaming\python\python312\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: click in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from nltk) (8.1.7)
Requirement already satisfied: joblib in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from nltk) (1.4.2)
Requirement already satisfied: regex>=2021.8.3 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from nltk) (2024.5.15)
Requirement already satisfied: tqdm in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from nltk) (4.66.4)
Requirement already satisfied: scipy>=1.6.0 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from scikit-learn) (3.5.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from matplotlib) (1.2.1)
Requirement already satisfied: cycler>=0.10 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from matplotlib) (4.53.0)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from matplotlib) (1.4.5)
Requirement already satisfied: packaging>=20.0 in c:\users\alixp\appdata\roaming\python\python312\site-packages (from matplotlib) (24.0)
Requirement already satisfied: pillow>=8 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from matplotlib) (10.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\alixp\anaconda3\envs\nlp\lib\site-packages (from matplotlib) (3.1.2)
Requirement already satisfied: six>=1.5 in c:\users\alixp\appdata\roaming\python\python312\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: colorama in c:\users\alixp\appdata\roaming\python\python312\site-packages (from click->nltk) (0.4.6)

In [2]:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [3]:

# Download necessary NLTK packages
# 'stopwords' provides the word list read by stopwords.words('english');
# 'wordnet' is the lexical database used by WordNetLemmatizer.
# The return value of the last call is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alixp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alixp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Out[3]:

True

In [4]:

# Load the data from CSV file
# DATA SOURCE: https://www.kaggle.com/datasets/fahadrehman07/voices-of-history-50-famous-speeches/
# NOTE(review): absolute, user-specific Windows path — anyone else running the
# notebook has to edit it; consider a path relative to the notebook.
file_path = 'C:/Users/alixp/OneDrive/CODES/Python/NOTEBOOKS/50 Famous Speechs.csv'
# NOTE(review): 'ANSI' is Python's Windows-only alias for the 'mbcs' codec (the
# active system code page). On Linux/macOS this call raises LookupError, and on
# Windows the decoding depends on the machine's locale.
# TODO confirm the file's real encoding (presumably cp1252) and name it explicitly.
df = pd.read_csv(file_path, encoding='ANSI')

In [5]:

# Clean the texts
def clean_text(text):
    """Normalize raw text into lowercase alphabetic words separated by single spaces.

    Every run of characters outside ``a-z``/``A-Z`` (whitespace, line breaks,
    digits, punctuation, accented letters) is treated as one word separator.
    Leading and trailing separators are dropped.

    Parameters
    ----------
    text : str
        Raw text. A non-string value (``None``, or the ``NaN`` pandas produces
        for an empty CSV cell) is treated as an empty document.

    Returns
    -------
    str
        Lowercase words joined by single spaces; ``''`` when no letters remain.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise make re.sub raise TypeError.
        return ''
    # Whitespace is itself non-alphabetic, so a single pass both strips the
    # unwanted characters and collapses the gaps they leave behind. Replacing
    # first and collapsing afterwards is what guarantees no double spaces.
    text = re.sub(r"[^a-zA-Z]+", " ", text).strip()
    return text.lower()

In [6]:

# Run clean_text on every entry of the 'Speech' column; the raw column is kept
# and the result is stored next to it as 'Cleaned_Speech'.
df['Cleaned_Speech'] = df['Speech'].apply(clean_text)

In [7]:

# Remove stopwords and perform lemmatization
# Both objects are built once here and reused for every speech by preprocess_text.
# The stop-word list is converted to a set because preprocess_text tests
# membership once per word.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:

def preprocess_text(text):
    """Drop stop words from ``text`` and lemmatize the words that remain.

    ``text`` is split on whitespace. Each word not found in the notebook-level
    ``stop_words`` collection is passed through the notebook-level
    ``lemmatizer``; the resulting lemmas are returned joined by single spaces.
    """
    kept_lemmas = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

In [9]:

# Run preprocess_text on every cleaned speech; the result is stored as
# 'Preprocessed_Speech' and is the text that gets vectorized.
df['Preprocessed_Speech'] = df['Cleaned_Speech'].apply(preprocess_text)

In [10]:

# Vectorize the texts
# max_df=0.9 ignores terms that occur in more than 90% of the speeches,
# min_df=2 ignores terms that occur in fewer than 2 speeches, and
# stop_words='english' applies scikit-learn's built-in English stop-word list
# on top of the NLTK filtering already done in preprocess_text.
vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
# dtm: document-term count matrix, one row per speech.
dtm = vectorizer.fit_transform(df['Preprocessed_Speech'])

In [11]:

# LDA Model
# Five topics; random_state is fixed so repeated runs give the same topics.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
# fit() returns the estimator, which the cell displays as its output.
lda.fit(dtm)

Out[11]:

LatentDirichletAllocation(n_components=5, random_state=42)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

LatentDirichletAllocation?Documentation for LatentDirichletAllocationiFitted

LatentDirichletAllocation(n_components=5, random_state=42)

In [12]:

# Display topics
def display_topics(model, feature_names, no_top_words):
    """Print every topic of ``model`` with its highest-weighted terms.

    For each row of ``model.components_`` a ``Topic <index>:`` header line is
    printed, followed by one line holding the ``no_top_words`` entries of
    ``feature_names`` with the largest weights in that row, heaviest first,
    separated by spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending: reverse it, then keep the leading positions.
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[pos] for pos in ranked_positions]
        print(" ".join(top_terms))

In [13]:

# Number of top-weighted terms printed per topic.
no_top_words = 10
# Vocabulary learned by the vectorizer; also reused by the word-cloud cell.
tf_feature_names = vectorizer.get_feature_names_out()
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
party shall people know fear great world day leadership general
Topic 1:
people day woman hope freedom like nation black faith let
Topic 2:
woman life human power make future law say people state
Topic 3:
war world rise like let country hope ask history million
Topic 4:
people right shall peace life men struggle freedom woman let

In [14]:

# Word Cloud Visualization
# One figure per topic: every vocabulary term is weighted by its value in that
# topic's row of lda.components_ and rendered as a word cloud.
for idx, topic in enumerate(lda.components_):
    fig, ax = plt.subplots()
    term_weights = dict(zip(tf_feature_names, topic))
    cloud = WordCloud(background_color='white').fit_words(term_weights)
    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(f'Topic {idx}')
    plt.show()

No description has been provided for this image

In [15]:

# Topic Distribution in Documents
# topic_dist has one row per speech (same order as df) and one column per topic.
topic_dist = lda.transform(dtm)

# Only the first few speeches are plotted, one bar chart each.
max_speeches_shown = 10
speech_titles = df["Title of the Speech"]

for i in range(min(max_speeches_shown, len(df))):
    plt.figure()
    plt.bar(range(len(topic_dist[i])), topic_dist[i])
    plt.xlabel('Topic')
    plt.ylabel('Proportion')
    # i is a row *position* in topic_dist, so the title is fetched by position
    # (.iloc) rather than by index label, which only coincides with the
    # position while df keeps its default 0..n-1 index.
    plt.title(f'Speech {speech_titles.iloc[i]}')
    plt.show()

Re: Voices of History: 40 Famous Speeches

Alix Paulino - penktadienis, 2024 birželio 14, 02:34

I decided to complete the task using Python instead of R because I encountered several issues while trying to work with R. Python provided a more seamless experience for data preprocessing, topic modeling with LDA, and visualization of the results. Here is a detailed explanation of the steps I followed to achieve the analysis:

Choosing the Corpus:
I used the Kaggle dataset "Voices of History: 50 Famous Speeches" (loaded from the file "50 Famous Speechs.csv") for the topic modeling analysis.

Cleaning the Texts:
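I cleaned the texts by collapsing whitespace, replacing every non-alphabetic character (digits, punctuation, and other symbols) with a space, and converting everything to lowercase, so that only plain words remain.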

Loading the Texts into a CSV File:
The speeches were already in a CSV file, where the first column is the document ID and the second column contains the speech content.

Text Preprocessing:
I used Python's NLTK library for text preprocessing. Each cleaned text was split into words on whitespace, English stop words from NLTK's stop-word list were removed, and the remaining words were lemmatized with NLTK's WordNetLemmatizer. Special characters had already been replaced by spaces in the cleaning step.

Model Calculation:
I used the scikit-learn library to vectorize the texts (CountVectorizer with max_df=0.9, min_df=2, and English stop words) and to build the LDA model (LatentDirichletAllocation with 5 topics and random_state=42) in order to identify key topics in the speeches.

Visualization of Results:
I used the Matplotlib and WordCloud libraries to visualize the results. This included generating a word cloud for each topic and plotting, for each of the first ten speeches, a separate bar chart of its topic proportions.

Best regards,
Alix Paulino