import pandas as pd
import networkx as nx
from pyvis.network import Network
import streamlit as st
import streamlit.components.v1 as components
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os
from snownlp import SnowNLP
import platform
import re
import jieba
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from collections import Counter
# Load data
@st.cache_data
def load_data():
    data = pd.read_csv('/Users/yuchang/Documents/CUHK/Lesson/project/dataset.csv')  # Load the Weibo dataset CSV (adjust the path for your environment)
data['created_at'] = pd.to_datetime(data['created_at']) # Convert time format
# Clean parentID and currentID columns
data['parentID'] = data['parentID'].fillna(-1).astype(int) # Replace NaN with -1 and convert to integer
data['currentID'] = data['currentID'].fillna(-1).astype(int) # Same as above
return data
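# Columns assumed to be present in the CSV (inferred from how they are used below):
# postID, parentID, currentID, content, created_at, rumor, category.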
def clean_content(content):
"""
Clean Weibo content:
- Remove emojis (e.g., [Broken], 😊).
- Remove special symbols (e.g., #, @, ...).
- Remove extra spaces.
- Return cleaned text.
"""
if not isinstance(content, str) or not content.strip():
return "" # If the content is empty or not a string, return an empty string
# Remove emojis (e.g., [Broken])
content = re.sub(r'\[.*?\]', '', content)
# Remove special symbols (e.g., #, @, ...)
content = re.sub(r'[^\w\s。,!?]', '', content)
# Remove extra spaces
content = content.strip()
return content
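# Illustrative example (hypothetical input): the bracketed emoticon and the "@"
# are removed by the two regex passes above, e.g.
#   clean_content("[泪] @朋友 这也太吓人了")  ->  "朋友 这也太吓人了"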
# Load stopword list
def load_stopwords():
"""
Load stopword list.
"""
stopwords = set()
with open('/Users/yuchang/Documents/CUHK/Lesson/project/cn_stopwords.txt', 'r', encoding='utf-8') as f:
for line in f:
stopwords.add(line.strip())
return stopwords
stopwords = load_stopwords()
data = load_data()
# Load BERT model and tokenizer
@st.cache_resource
def load_bert_model():
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext')
model = BertForSequenceClassification.from_pretrained('hfl/chinese-bert-wwm-ext')
model.eval()
return tokenizer, model
tokenizer, model = load_bert_model()
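# Note (an assumption worth flagging): 'hfl/chinese-bert-wwm-ext' is a general
# pre-trained Chinese BERT without a sentiment-tuned head, so the classification
# layer created by BertForSequenceClassification starts from random weights and
# the scores below should be treated as illustrative. A checkpoint fine-tuned for
# Chinese sentiment could be substituted in load_bert_model(), e.g.
#   model = BertForSequenceClassification.from_pretrained('<sentiment-finetuned-checkpoint>')
# where '<sentiment-finetuned-checkpoint>' is a placeholder, not a verified model name.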
def bert_sentiment_analysis(text):
"""
Perform sentiment analysis using the BERT model.
    Returns: positive-class probability in [0, 1] (near 0 = negative, near 1 = positive).
"""
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
probs = torch.softmax(logits, dim=-1)
sentiment_score = probs[:, 1].item() # Positive probability
return sentiment_score
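# Example usage (illustrative), applying the same 0.6 / 0.4 thresholds used below:
#   score = bert_sentiment_analysis("这条微博太让人难过了")
#   label = "Positive" if score > 0.6 else ("Negative" if score < 0.4 else "Neutral")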
# Title and description
st.title("Weibo Rumor Propagation Path Dynamic Visualization")
st.write("Please enter `postID` to view the corresponding propagation path, content word cloud, and other information.")
# User input postID
postID = st.text_input("Please enter postID:", value="4738231273522828")
# Filter data
if postID:
try:
post_data = data[data['postID'] == int(postID)]
if post_data.empty:
st.error(f"No data found for postID {postID}!")
else:
# Display rumor and category
            rumor_status = "Rumor" if post_data['rumor'].iloc[0] else "Non-Rumor"
category = post_data['category'].iloc[0]
st.write(f"**Rumor Status:** {rumor_status}")
st.write(f"**Category:** {category}")
# Build propagation network
G = nx.DiGraph()
node_content = {} # Store node content
for _, row in post_data.iterrows():
parentID = int(row['parentID'])
currentID = int(row['currentID'])
                # Add a directed edge from the parent post to the current post (propagation direction)
                G.add_edge(parentID, currentID)
# Store node content
cleaned_content = clean_content(row['content']) # Clean content
node_content[currentID] = cleaned_content
if parentID not in node_content:
node_content[parentID] = "" # If parent node has no content, default to empty
# Use PyVis to create dynamic graph
            net = Network(notebook=True, cdn_resources='remote', height="600px", width="100%", bgcolor="#ffffff",
                          font_color="black")  # Black labels so they remain visible on the white background
net.from_nx(G)
            # Add a title to each node (content is shown on hover)
for node in net.nodes:
node_id = node['id']
content = node_content.get(node_id, "No content available")
node['title'] = f"Content: {content}" # Set hover display content
# Set layout options
net.force_atlas_2based(gravity=-50, central_gravity=0.01, spring_length=100, spring_strength=0.05,
damping=0.4, overlap=0)
# Save as HTML file
html_file = f"rumor_network_{postID}.html"
net.save_graph(html_file)
# Display dynamic graph in Streamlit
with open(html_file, 'r', encoding='utf-8') as f:
html_content = f.read()
st.write("**Propagation Path Dynamic Graph:**")
components.html(html_content, height=600)
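            # Optional tidy-up (a suggestion, not part of the original flow): the HTML file
            # is only needed long enough to be read back into Streamlit, so it could be
            # deleted afterwards, e.g. os.remove(html_file).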
# Generate word cloud
all_content = " ".join(
[str(content).strip() for content in node_content.values() if
isinstance(content, str) and content.strip()]
)
if all_content:
# Tokenize
words = jieba.lcut(all_content) # Use jieba for tokenization
# Remove stopwords
filtered_words = [word for word in words if
word not in stopwords and len(word) > 1] # Remove single-character words
# Count word frequency
word_freq = Counter(filtered_words)
# Select font path based on operating system
system = platform.system()
if system == "Darwin": # macOS
font_path = "/System/Library/Fonts/STHeiti Light.ttc"
elif system == "Windows": # Windows
font_path = "C:/Windows/Fonts/simhei.ttf"
else: # Linux or other systems
font_path = "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc" # Common open-source Chinese font
wordcloud = WordCloud(
font_path=font_path,
width=800,
height=400,
background_color='white'
).generate_from_frequencies(word_freq)
                # Plot word cloud on an explicit figure (more robust than passing the pyplot module to st.pyplot)
                fig_wc, ax_wc = plt.subplots(figsize=(10, 5))
                ax_wc.imshow(wordcloud, interpolation='bilinear')
                ax_wc.axis('off')
                ax_wc.set_title(f"Word Cloud for PostID: {postID}")
                # Display word cloud in Streamlit
                st.write("**Propagation Content Word Cloud:**")
                st.pyplot(fig_wc)
else:
st.write("No content available for the current postID to generate a word cloud.")
# Sentiment analysis
sentiment_results = []
for node_id, content in node_content.items():
if isinstance(content, str) and content.strip():
sentiment_score = bert_sentiment_analysis(content) # Use BERT model
sentiment_label = "Positive" if sentiment_score > 0.6 else (
"Negative" if sentiment_score < 0.4 else "Neutral")
sentiment_results.append(
{"Node ID": node_id, "Content": content, "Sentiment Score": sentiment_score,
"Sentiment Label": sentiment_label})
if sentiment_results:
# Convert to DataFrame
sentiment_df = pd.DataFrame(sentiment_results)
# Display table
st.write("**Sentiment Analysis Results for Each Comment:**")
st.dataframe(sentiment_df)
# Calculate overall sentiment score
avg_sentiment = sentiment_df["Sentiment Score"].mean()
overall_sentiment = "Positive" if avg_sentiment > 0.6 else (
"Negative" if avg_sentiment < 0.4 else "Neutral")
st.write(f"**Average Sentiment Score:** {avg_sentiment:.2f}")
st.write(f"**Overall Sentiment:** {overall_sentiment}")
                # Plot sentiment distribution histogram on an explicit figure
                fig_hist, ax_hist = plt.subplots(figsize=(10, 5))
                ax_hist.hist(sentiment_df["Sentiment Score"], bins=20, color='purple', alpha=0.7)
                ax_hist.set_title("Sentiment Distribution of Posts")
                ax_hist.set_xlabel("Sentiment Score (0 = Negative, 1 = Positive)")
                ax_hist.set_ylabel("Frequency")
                st.write("**Sentiment Distribution Histogram:**")
                st.pyplot(fig_hist)
else:
st.write("No content available for the current postID for sentiment analysis.")
# Sentiment analysis: Daily average sentiment score
daily_sentiment = []
for date, group in post_data.groupby(post_data['created_at'].dt.date):
sentiments = []
for content in group['content']:
cleaned_content = clean_content(content) # Clean content
if cleaned_content.strip():
sentiment_score = bert_sentiment_analysis(cleaned_content) # Use BERT model
sentiments.append(sentiment_score)
if sentiments:
avg_daily_sentiment = sum(sentiments) / len(sentiments)
daily_sentiment.append({"Date": date, "Average Sentiment": avg_daily_sentiment})
if daily_sentiment:
# Convert to DataFrame
daily_sentiment_df = pd.DataFrame(daily_sentiment)
daily_sentiment_df.set_index("Date", inplace=True)
# Find the date with the lowest average score
lowest_date = daily_sentiment_df["Average Sentiment"].idxmin()
lowest_avg_sentiment = daily_sentiment_df.loc[lowest_date, "Average Sentiment"]
st.write(
f"**Date with Lowest Average Sentiment Score:** {lowest_date} (Score: {lowest_avg_sentiment:.2f})")
# Extract comments and scores for that day
lowest_date_comments = post_data[post_data['created_at'].dt.date == lowest_date]
lowest_date_sentiments = []
for _, row in lowest_date_comments.iterrows():
cleaned_content = clean_content(row['content']) # Clean content
if cleaned_content.strip():
                        sentiment_score = bert_sentiment_analysis(cleaned_content)  # Use the same BERT scorer as above so scores stay consistent
sentiment_label = "Positive" if sentiment_score > 0.6 else (
"Negative" if sentiment_score < 0.4 else "Neutral")
lowest_date_sentiments.append({"Content": cleaned_content, "Sentiment Score": sentiment_score,
"Sentiment Label": sentiment_label})
if lowest_date_sentiments:
lowest_date_sentiments_df = pd.DataFrame(lowest_date_sentiments)
st.write(f"**Comments and Scores for {lowest_date}:**")
st.dataframe(lowest_date_sentiments_df)
else:
st.write(f"No comments available for {lowest_date}.")
                # Plot daily sentiment score line chart on an explicit figure
                fig_daily, ax_daily = plt.subplots(figsize=(12, 6))
                ax_daily.plot(daily_sentiment_df.index, daily_sentiment_df["Average Sentiment"], marker='o', color='orange')
                ax_daily.set_title("Daily Average Sentiment Score")
                ax_daily.set_xlabel("Date")
                ax_daily.set_ylabel("Average Sentiment Score")
                ax_daily.grid(True)
                st.write("**Daily Sentiment Score Line Chart:**")
                st.pyplot(fig_daily)
else:
st.write("No content available for the current postID for daily sentiment analysis.")
except ValueError:
st.error("Please enter a valid numeric postID!")