import pandas as pd
import networkx as nx
from pyvis.network import Network
import streamlit as st
import streamlit.components.v1 as components
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os
from snownlp import SnowNLP
import platform
import re
import jieba
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from collections import Counter
# Load data
@st.cache_data
def load_data():
    data = pd.read_csv('/Users/yuchang/Documents/CUHK/Lesson/project/dataset.csv')  # Load the Weibo dataset CSV (adjust the path for your environment)
data['created_at'] = pd.to_datetime(data['created_at']) # Convert time format
# Clean parentID and currentID columns
data['parentID'] = data['parentID'].fillna(-1).astype(int) # Replace NaN with -1 and convert to integer
data['currentID'] = data['currentID'].fillna(-1).astype(int) # Same as above
return data
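# Columns assumed to be present in the CSV (inferred from how they are used below):
# postID, parentID, currentID, content, created_at, rumor, category.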
def clean_content(content):
"""
Clean Weibo content:
- Remove emojis (e.g., [Broken], 😊).
- Remove special symbols (e.g., #, @, ...).
- Remove extra spaces.
- Return cleaned text.
"""
if not isinstance(content, str) or not content.strip():
return "" # If the content is empty or not a string, return an empty string
# Remove emojis (e.g., [Broken])
content = re.sub(r'\[.*?\]', '', content)
# Remove special symbols (e.g., #, @, ...)
content = re.sub(r'[^\w\s。,!?]', '', content)
# Remove extra spaces
content = content.strip()
return content
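# Illustrative example (hypothetical input): the bracketed emoticon and the "@"
# are removed by the two regex passes above, e.g.
#   clean_content("[泪] @朋友 这也太吓人了")  ->  "朋友 这也太吓人了"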
# Load stopword list
def load_stopwords():
"""
Load stopword list.
"""
stopwords = set()
with open('/Users/yuchang/Documents/CUHK/Lesson/project/cn_stopwords.txt', 'r', encoding='utf-8') as f:
for line in f:
stopwords.add(line.strip())
return stopwords
stopwords = load_stopwords()
data = load_data()
# Load BERT model and tokenizer
@st.cache_resource
def load_bert_model():
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext')
model = BertForSequenceClassification.from_pretrained('hfl/chinese-bert-wwm-ext')
model.eval()
return tokenizer, model
tokenizer, model = load_bert_model()
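# Note (an assumption worth flagging): 'hfl/chinese-bert-wwm-ext' is a general
# pre-trained Chinese BERT without a sentiment-tuned head, so the classification
# layer created by BertForSequenceClassification starts from random weights and
# the scores below should be treated as illustrative. A checkpoint fine-tuned for
# Chinese sentiment could be substituted in load_bert_model(), e.g.
#   model = BertForSequenceClassification.from_pretrained('<sentiment-finetuned-checkpoint>')
# where '<sentiment-finetuned-checkpoint>' is a placeholder, not a verified model name.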
def bert_sentiment_analysis(text):
"""
Perform sentiment analysis using the BERT model.
    Returns: positive-class probability in [0, 1] (near 0 = negative, near 1 = positive).
"""
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
probs = torch.softmax(logits, dim=-1)
sentiment_score = probs[:, 1].item() # Positive probability
return sentiment_score
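# Example usage (illustrative), applying the same 0.6 / 0.4 thresholds used below:
#   score = bert_sentiment_analysis("这条微博太让人难过了")
#   label = "Positive" if score > 0.6 else ("Negative" if score < 0.4 else "Neutral")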
# Title and description
st.title("Weibo Rumor Propagation Path Dynamic Visualization")
st.write("Please enter `postID` to view the corresponding propagation path, content word cloud, and other information.")
# User input postID
postID = st.text_input("Please enter postID:", value="4738231273522828")
# Filter data
if postID:
try:
post_data = data[data['postID'] == int(postID)]
if post_data.empty:
st.error(f"No data found for postID {postID}!")
else:
# Display rumor and category
            rumor_status = "Rumor" if post_data['rumor'].iloc[0] else "Non-Rumor"
category = post_data['category'].iloc[0]
st.write(f"**Rumor Status:** {rumor_status}")
st.write(f"**Category:** {category}")
# Build propagation network
G = nx.DiGraph()
node_content = {} # Store node content
for _, row in post_data.iterrows():
parentID = int(row['parentID'])
currentID = int(row['currentID'])
                # Add a directed edge from the parent post to the current post (propagation direction)
                G.add_edge(parentID, currentID)
# Store node content
cleaned_content = clean_content(row['content']) # Clean content
node_content[currentID] = cleaned_content
if parentID not in node_content:
node_content[parentID] = "" # If parent node has no content, default to empty
# Use PyVis to create dynamic graph
            net = Network(notebook=True, cdn_resources='remote', height="600px", width="100%", bgcolor="#ffffff",
                          font_color="black")  # Black labels so they remain visible on the white background
net.from_nx(G)
            # Add a title to each node (content is shown on hover)
for node in net.nodes:
node_id = node['id']
content = node_content.get(node_id, "No content available")
node['title'] = f"Content: {content}" # Set hover display content
# Set layout options
net.force_atlas_2based(gravity=-50, central_gravity=0.01, spring_length=100, spring_strength=0.05,
damping=0.4, overlap=0)
# Save as HTML file
html_file = f"rumor_network_{postID}.html"
net.save_graph(html_file)
# Display dynamic graph in Streamlit
with open(html_file, 'r', encoding='utf-8') as f:
html_content = f.read()
st.write("**Propagation Path Dynamic Graph:**")
components.html(html_content, height=600)
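            # Optional tidy-up (a suggestion, not part of the original flow): the HTML file
            # is only needed long enough to be read back into Streamlit, so it could be
            # deleted afterwards, e.g. os.remove(html_file).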
# Generate word cloud
all_content = " ".join(
[str(content).strip() for content in node_content.values() if
isinstance(content, str) and content.strip()]
)
if all_content:
# Tokenize
words = jieba.lcut(all_content) # Use jieba for tokenization
# Remove stopwords
filtered_words = [word for word in words if
word not in stopwords and len(word) > 1] # Remove single-character words
# Count word frequency
word_freq = Counter(filtered_words)
# Select font path based on operating system
system = platform.system()
if system == "Darwin": # macOS
font_path = "/System/Library/Fonts/STHeiti Light.ttc"
elif system == "Windows": # Windows
font_path = "C:/Windows/Fonts/simhei.ttf"
else: # Linux or other systems
font_path = "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc" # Common open-source Chinese font
wordcloud = WordCloud(
font_path=font_path,
width=800,
height=400,
background_color='white'
).generate_from_frequencies(word_freq)
                # Plot word cloud on an explicit figure (more robust than passing the pyplot module to st.pyplot)
                fig_wc, ax_wc = plt.subplots(figsize=(10, 5))
                ax_wc.imshow(wordcloud, interpolation='bilinear')
                ax_wc.axis('off')
                ax_wc.set_title(f"Word Cloud for PostID: {postID}")
                # Display word cloud in Streamlit
                st.write("**Propagation Content Word Cloud:**")
                st.pyplot(fig_wc)
else:
st.write("No content available for the current postID to generate a word cloud.")
# Sentiment analysis
sentiment_results = []
for node_id, content in node_content.items():
if isinstance(content, str) and content.strip():
sentiment_score = bert_sentiment_analysis(content) # Use BERT model
sentiment_label = "Positive" if sentiment_score > 0.6 else (
"Negative" if sentiment_score < 0.4 else "Neutral")
sentiment_results.append(
{"Node ID": node_id, "Content": content, "Sentiment Score": sentiment_score,
"Sentiment Label": sentiment_label})
if sentiment_results:
# Convert to DataFrame
sentiment_df = pd.DataFrame(sentiment_results)
# Display table
st.write("**Sentiment Analysis Results for Each Comment:**")
st.dataframe(sentiment_df)
# Calculate overall sentiment score
avg_sentiment = sentiment_df["Sentiment Score"].mean()
overall_sentiment = "Positive" if avg_sentiment > 0.6 else (
"Negative" if avg_sentiment < 0.4 else "Neutral")
st.write(f"**Average Sentiment Score:** {avg_sentiment:.2f}")
st.write(f"**Overall Sentiment:** {overall_sentiment}")
                # Plot sentiment distribution histogram on an explicit figure
                fig_hist, ax_hist = plt.subplots(figsize=(10, 5))
                ax_hist.hist(sentiment_df["Sentiment Score"], bins=20, color='purple', alpha=0.7)
                ax_hist.set_title("Sentiment Distribution of Posts")
                ax_hist.set_xlabel("Sentiment Score (0 = Negative, 1 = Positive)")
                ax_hist.set_ylabel("Frequency")
                st.write("**Sentiment Distribution Histogram:**")
                st.pyplot(fig_hist)
else:
st.write("No content available for the current postID for sentiment analysis.")
# Sentiment analysis: Daily average sentiment score
daily_sentiment = []
for date, group in post_data.groupby(post_data['created_at'].dt.date):
sentiments = []
for content in group['content']:
cleaned_content = clean_content(content) # Clean content
if cleaned_content.strip():
sentiment_score = bert_sentiment_analysis(cleaned_content) # Use BERT model
sentiments.append(sentiment_score)
if sentiments:
avg_daily_sentiment = sum(sentiments) / len(sentiments)
daily_sentiment.append({"Date": date, "Average Sentiment": avg_daily_sentiment})
if daily_sentiment:
# Convert to DataFrame
daily_sentiment_df = pd.DataFrame(daily_sentiment)
daily_sentiment_df.set_index("Date", inplace=True)
# Find the date with the lowest average score
lowest_date = daily_sentiment_df["Average Sentiment"].idxmin()
lowest_avg_sentiment = daily_sentiment_df.loc[lowest_date, "Average Sentiment"]
st.write(
f"**Date with Lowest Average Sentiment Score:** {lowest_date} (Score: {lowest_avg_sentiment:.2f})")
# Extract comments and scores for that day
lowest_date_comments = post_data[post_data['created_at'].dt.date == lowest_date]
lowest_date_sentiments = []
for _, row in lowest_date_comments.iterrows():
cleaned_content = clean_content(row['content']) # Clean content
if cleaned_content.strip():
                        sentiment_score = bert_sentiment_analysis(cleaned_content)  # Use the same BERT scorer as above so scores stay consistent
sentiment_label = "Positive" if sentiment_score > 0.6 else (
"Negative" if sentiment_score < 0.4 else "Neutral")
lowest_date_sentiments.append({"Content": cleaned_content, "Sentiment Score": sentiment_score,
"Sentiment Label": sentiment_label})
if lowest_date_sentiments:
lowest_date_sentiments_df = pd.DataFrame(lowest_date_sentiments)
st.write(f"**Comments and Scores for {lowest_date}:**")
st.dataframe(lowest_date_sentiments_df)
else:
st.write(f"No comments available for {lowest_date}.")
                # Plot daily sentiment score line chart on an explicit figure
                fig_daily, ax_daily = plt.subplots(figsize=(12, 6))
                ax_daily.plot(daily_sentiment_df.index, daily_sentiment_df["Average Sentiment"], marker='o', color='orange')
                ax_daily.set_title("Daily Average Sentiment Score")
                ax_daily.set_xlabel("Date")
                ax_daily.set_ylabel("Average Sentiment Score")
                ax_daily.grid(True)
                st.write("**Daily Sentiment Score Line Chart:**")
                st.pyplot(fig_daily)
else:
st.write("No content available for the current postID for daily sentiment analysis.")
except ValueError:
st.error("Please enter a valid numeric postID!")