Spaces:

KavinduHansaka
/

Toxic_Comment_Classifier

Sleeping

App Files Files Community

Toxic_Comment_Classifier / app.py

KavinduHansaka

Update app.py

e2f6307 verified 4 months ago

raw

history blame contribute delete

7.48 kB

	import os
	import tempfile
	from pathlib import Path
	from typing import List
	import numpy as np

	import gradio as gr
	import pandas as pd
	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import docx
	import matplotlib.pyplot as plt

	try:
	import fitz # PyMuPDF
	except ImportError as e:
	raise ImportError("Missing dependency: PyMuPDF") from e


	# =========================
	# CPU OPTIMIZATION
	# =========================
	# Set before the tokenizer is created further down in this module.
	os.environ["TOKENIZERS_PARALLELISM"] = "false"
	# Limit PyTorch to two CPU threads for intra-op work.
	torch.set_num_threads(2)
	# The app only runs inference, so autograd is switched off process-wide.
	torch.set_grad_enabled(False)

	# =========================
	# CONFIGURATION
	# =========================
	# Checkpoint identifier handed to both from_pretrained() calls below.
	MODEL_NAME = "openai-community/roberta-base-openai-detector"
	# Probabilities >= this value are labelled "Likely AI"; the distance from it
	# also drives the High/Medium/Low buckets in calibrate_confidence().
	AI_THRESHOLD = 0.5
	# Passed as max_length to the tokenizer (with truncation=True).
	MAX_LENGTH = 256
	# Number of text chunks sent through the model per forward pass.
	BATCH_SIZE = 8
	# Device the model is moved to.
	DEVICE = "cpu"

	# File suffixes accepted by load_text_from_file().
	SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}

	# =========================
	# MODEL LOADING (ONCE)
	# =========================
	# Loaded at import time so every request reuses the same objects.
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
	model.to(DEVICE)
	# Switch to evaluation mode before serving predictions.
	model.eval()


	# =========================
	# FILE LOADERS
	# =========================
	def load_text_from_file(file_path: str) -> str:
	    """Extract the plain text of a .txt, .pdf or .docx file.

	    The extension is matched case-insensitively for both the support check
	    and the loader dispatch, so ``REPORT.PDF`` is read the same way as
	    ``report.pdf`` instead of passing validation and returning ``None``.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The extracted text. PDF pages and non-blank DOCX paragraphs are
	        joined with newlines; text files are decoded as UTF-8 with
	        undecodable bytes dropped.

	    Raises:
	        ValueError: If the extension is not in ``SUPPORTED_EXTENSIONS`` or
	            has no loader in this function.
	    """
	    path = Path(file_path)
	    # Normalise once so validation and dispatch cannot disagree on case.
	    suffix = path.suffix.lower()

	    if suffix not in SUPPORTED_EXTENSIONS:
	        raise ValueError(f"Unsupported file type: {path.suffix}")

	    if suffix == ".txt":
	        return path.read_text(encoding="utf-8", errors="ignore")

	    if suffix == ".pdf":
	        with fitz.open(path) as pdf:
	            # join() consumes the generator while the document is still open.
	            return "\n".join(page.get_text() for page in pdf)

	    if suffix == ".docx":
	        document = docx.Document(path)
	        return "\n".join(p.text for p in document.paragraphs if p.text.strip())

	    # Reached only if SUPPORTED_EXTENSIONS gains a type without a loader here;
	    # fail loudly rather than return None from a function typed as str.
	    raise ValueError(f"Unsupported file type: {path.suffix}")


	# =========================
	# TEXT UTILITIES
	# =========================
	def chunk_text(text: str, max_words: int = 200, min_words: int = 20) -> List[str]:
	    """Split text into consecutive chunks of whitespace-separated words.

	    Args:
	        text: Text to split. Any run of whitespace separates words.
	        max_words: Maximum number of words per chunk.
	        min_words: Chunks with fewer words than this are dropped. In practice
	            this only affects the final chunk (or all chunks when
	            ``min_words`` exceeds ``max_words``).

	    Returns:
	        The kept chunks, each with its words re-joined by single spaces.

	    Raises:
	        ValueError: If ``max_words`` is not a positive integer.
	    """
	    if max_words < 1:
	        raise ValueError("max_words must be a positive integer")

	    words = text.split()
	    chunks = []

	    for start in range(0, len(words), max_words):
	        window = words[start:start + max_words]
	        # The window already is the word list, so its length is the word
	        # count; no need to join and split again.
	        if len(window) >= min_words:
	            chunks.append(" ".join(window))

	    return chunks


	# =========================
	# CONFIDENCE CALIBRATION
	# =========================
	def calibrate_confidence(prob: float) -> str:
	    """Bucket a probability by how far it lies from ``AI_THRESHOLD``.

	    Args:
	        prob: Probability to grade.

	    Returns:
	        ``"High"`` when the distance is at least 0.35, ``"Medium"`` when it
	        is at least 0.15, otherwise ``"Low"``.
	    """
	    margin = abs(prob - AI_THRESHOLD)
	    # Ordered from the strictest floor down; the first one met wins.
	    for label, floor in (("High", 0.35), ("Medium", 0.15)):
	        if margin >= floor:
	            return label
	    return "Low"


	# =========================
	# AI DETECTION (BATCHED)
	# =========================
	@torch.no_grad()
	def detect_ai_probability(texts: List[str], progress=gr.Progress()):
	    """Score each text with the detector model, ``BATCH_SIZE`` texts at a time.

	    The logit column read as the "AI" class is resolved from the model's own
	    ``id2label`` mapping rather than hard-coded, because detector checkpoints
	    do not agree on class order.
	    NOTE(review): the checkpoint named in ``MODEL_NAME`` is believed to
	    publish index 0 as "Fake" (machine-generated) - confirm against
	    ``model.config.id2label``.

	    Args:
	        texts: Texts to score.
	        progress: Gradio progress tracker, called with ``(done, total)``.

	    Returns:
	        One probability per input text, in input order. An empty input
	        returns an empty list without touching the model or the tracker.
	    """
	    total = len(texts)
	    if total == 0:
	        return []

	    ai_index = _ai_label_index()
	    probabilities = []

	    for start in range(0, total, BATCH_SIZE):
	        progress((start, total))
	        batch = texts[start:start + BATCH_SIZE]

	        inputs = tokenizer(
	            batch,
	            return_tensors="pt",
	            padding=True,
	            truncation=True,
	            max_length=MAX_LENGTH,
	        ).to(DEVICE)  # keep the inputs on the same device as the model

	        logits = model(**inputs).logits
	        probs = torch.softmax(logits, dim=1)[:, ai_index]
	        probabilities.extend(probs.tolist())

	    progress((total, total))
	    return probabilities


	def _ai_label_index(default: int = 1) -> int:
	    """Return the logit column whose label marks machine-generated text.

	    Falls back to ``default`` (the column this module used historically) when
	    the model config carries no recognisable label name.
	    """
	    ai_labels = {"fake", "ai", "machine", "generated"}
	    id2label = getattr(model.config, "id2label", None) or {}
	    for index, label in id2label.items():
	        if str(label).strip().lower() in ai_labels:
	            return int(index)
	    return default


	# =========================
	# CLASSIFICATION LOGIC
	# =========================
	def classify_chunks(chunks: List[str], progress=gr.Progress()) -> pd.DataFrame:
	    """Score every chunk and tabulate the results, one row per chunk.

	    Args:
	        chunks: Text chunks to classify.
	        progress: Gradio progress tracker forwarded to the scoring step.

	    Returns:
	        A DataFrame with the columns "Text Chunk", "AI Probability (%)"
	        (rounded to two decimals), "Prediction" and "Confidence".
	    """
	    scores = detect_ai_probability(chunks, progress)

	    percentages = []
	    verdicts = []
	    confidences = []
	    for score in scores:
	        percentages.append(round(score * 100, 2))
	        if score >= AI_THRESHOLD:
	            verdicts.append("🤖 Likely AI")
	        else:
	            verdicts.append("🧍 Human")
	        confidences.append(calibrate_confidence(score))

	    return pd.DataFrame({
	        "Text Chunk": chunks,
	        "AI Probability (%)": percentages,
	        "Prediction": verdicts,
	        "Confidence": confidences,
	    })


	def document_summary(df: pd.DataFrame) -> pd.DataFrame:
	    """Append a document-level summary row to the per-chunk results.

	    The document is labelled "Likely AI" only when at least 60% of its chunks
	    are high-confidence *AI* predictions. High-confidence human chunks no
	    longer count towards the AI verdict. The summary confidence is "High"
	    when at least 60% of the chunks are high-confidence and agree with the
	    document verdict, otherwise "Medium".

	    Args:
	        df: Per-chunk results with the columns "AI Probability (%)",
	            "Prediction" and "Confidence".

	    Returns:
	        A new DataFrame: the rows of ``df`` followed by one summary row whose
	        "Text Chunk" is "📄 Document Summary" and whose probability is the
	        mean of the chunk percentages, rounded to two decimals.
	    """
	    total = len(df)
	    avg_prob = df["AI Probability (%)"].mean()

	    confident = df["Confidence"] == "High"
	    flagged_ai = df["Prediction"] == "🤖 Likely AI"
	    confident_ai = int((confident & flagged_ai).sum())
	    confident_human = int((confident & ~flagged_ai).sum())

	    # 60% of the chunks must back a verdict; an empty table backs nothing.
	    quorum = total * 0.6
	    is_ai = total > 0 and confident_ai >= quorum
	    agreeing = confident_ai if is_ai else confident_human
	    is_confident = total > 0 and agreeing >= quorum

	    summary = pd.DataFrame([{
	        "Text Chunk": "📄 Document Summary",
	        "AI Probability (%)": round(avg_prob, 2),
	        "Prediction": "🤖 Likely AI" if is_ai else "🧍 Human",
	        "Confidence": "High" if is_confident else "Medium",
	    }])

	    return pd.concat([df, summary], ignore_index=True)


	# =========================
	# GAUGE VISUALIZATION
	# =========================
	def generate_gauge(prob_percent: float, prediction: str) -> str:
	    """Render a half-circle gauge for a percentage and save it as a PNG.

	    Args:
	        prob_percent: Value shown on the gauge, on a 0-100 scale. The needle
	            is clamped to that range; the printed number is left as given.
	        prediction: Caption drawn beneath the percentage.

	    Returns:
	        Path of the PNG written to a temporary file. The file is not deleted
	        by this function.
	    """
	    fig, ax = plt.subplots(figsize=(6, 3))
	    try:
	        # 100 points sweeping from the left end (pi) to the right end (0).
	        angles = np.linspace(np.pi, 0, 100)

	        # Background arc
	        ax.plot(np.cos(angles), np.sin(angles), linewidth=20, alpha=0.15)

	        # Colored arc: 99 short segments, each joining two neighbouring points.
	        for i, val in enumerate(np.linspace(0, 100, 99)):
	            if val < 40:
	                color = "green"
	            elif val < 70:
	                color = "orange"
	            else:
	                color = "red"

	            ax.plot(
	                np.cos(angles[i:i + 2]),
	                np.sin(angles[i:i + 2]),
	                linewidth=20,
	                color=color,
	            )

	        # Needle: 0% points left, 100% points right. Clamp so an out-of-range
	        # value cannot swing the needle below the baseline.
	        needle_percent = min(max(float(prob_percent), 0.0), 100.0)
	        needle_angle = np.pi * (1 - needle_percent / 100)
	        ax.plot(
	            [0, 0.8 * np.cos(needle_angle)],
	            [0, 0.8 * np.sin(needle_angle)],
	            linewidth=4,
	        )

	        # Text
	        ax.text(0, -0.1, f"{prob_percent:.0f}%", ha="center", va="center", fontsize=24, weight="bold")
	        ax.text(0, -0.32, prediction, ha="center", va="center", fontsize=12)

	        ax.set_aspect("equal")
	        ax.axis("off")

	        # Reserve a unique file name, then close the handle before matplotlib
	        # writes to it: reopening a still-open temp file by name fails on
	        # Windows.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
	            path = tmp.name
	        fig.savefig(path, bbox_inches="tight", dpi=150)
	    finally:
	        # Release the figure even when drawing or saving raised.
	        plt.close(fig)

	    return path


	# =========================
	# GRADIO ENTRY FUNCTION
	# =========================
	def run_detector(text_input: str, uploaded_files, progress=gr.Progress()):
	    """Gradio callback: analyse pasted text and/or uploaded documents.

	    Args:
	        text_input: Text pasted into the textbox; may be empty or ``None``.
	        uploaded_files: Uploaded files, or ``None``. Each item may be a path
	            (``str`` / ``os.PathLike``) or an object exposing the path as
	            ``.name``.
	        progress: Gradio progress tracker forwarded to the classifier.

	    Returns:
	        ``(results, gauge_path)``. ``results`` is the per-chunk table with
	        the document summary as its last row, and ``gauge_path`` is the PNG
	        of the gauge. When nothing can be analysed, ``results`` is a
	        one-cell "Error" table and ``gauge_path`` is ``None``.
	    """
	    texts = []

	    pasted = (text_input or "").strip()
	    if pasted:
	        texts.append(pasted)

	    for file in uploaded_files or []:
	        # Uploads arrive either as plain paths or as objects carrying .name.
	        file_path = file if isinstance(file, (str, os.PathLike)) else file.name
	        try:
	            extracted = load_text_from_file(file_path)
	        except ValueError as exc:
	            # Unsupported file type: report it in the results table like the
	            # other input errors instead of failing the whole request.
	            return pd.DataFrame({"Error": [str(exc)]}), None
	        # Guard against a loader that yields nothing for this file.
	        texts.append(extracted or "")

	    if not texts:
	        return pd.DataFrame({"Error": ["No input provided"]}), None

	    chunks = []
	    for text in texts:
	        chunks.extend(chunk_text(text))

	    if not chunks:
	        return pd.DataFrame({"Error": ["Text too short for analysis"]}), None

	    df = classify_chunks(chunks, progress)
	    final_df = document_summary(df)

	    # document_summary() appends the summary as the last row.
	    summary_row = final_df.iloc[-1]
	    gauge_path = generate_gauge(
	        summary_row["AI Probability (%)"],
	        summary_row["Prediction"],
	    )

	    return final_df, gauge_path


	# =========================
	# GRADIO UI (HF SPACE)
	# =========================
	# Components are created in the order they should appear on the page.
	with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
	    # Heading and a short description of the accepted inputs.
	    gr.Markdown("## 🧪 Offline AI Document Detector")
	    gr.Markdown(
	        "Detect whether content is AI-generated using an offline, open-source model. "
	        "Supports PDF, DOCX, TXT, and pasted text. Optimized for CPU-only Hugging Face Spaces."
	    )

	    # Input 1: free text; run_detector() ignores it when blank.
	    text_input = gr.Textbox(
	        lines=6,
	        label="✍️ Paste Text (optional)"
	    )

	    # Input 2: any number of files, limited to the types the loader handles.
	    file_input = gr.File(
	        label="📂 Upload Documents",
	        file_types=[".pdf", ".docx", ".txt"],
	        file_count="multiple"
	    )

	    analyze_btn = gr.Button("🔍 Analyze")
	    # Outputs: the per-chunk table plus summary row, and the gauge image.
	    output_table = gr.Dataframe(label="📊 Detection Results")
	    gauge_plot = gr.Image(label="🧠 AI Probability Gauge")

	    # run_detector() returns (table, image path), matching the outputs order.
	    analyze_btn.click(
	        fn=run_detector,
	        inputs=[text_input, file_input],
	        outputs=[output_table, gauge_plot]
	    )

	# Start the server only when this file is executed as a script.
	if __name__ == "__main__":
	    app.launch()