| import os |
| import tempfile |
| from pathlib import Path |
| from typing import List |
| import numpy as np |
|
|
| import gradio as gr |
| import pandas as pd |
| import torch |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| import docx |
| import matplotlib.pyplot as plt |
|
|
| try: |
| import fitz |
| except ImportError as e: |
| raise ImportError("Missing dependency: PyMuPDF") from e |
|
|
|
|
| |
| |
| |
# Process-wide runtime settings, applied once at import time.
# The app only runs inference, so autograd is switched off globally and
# torch is capped at two intra-op threads.
torch.set_grad_enabled(False)
torch.set_num_threads(2)
# Environment flag consumed by the Hugging Face tokenizers backend.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
| |
| |
| |
# --- Detector configuration ---------------------------------------------
# Checkpoint id handed to ``from_pretrained`` for tokenizer and classifier.
MODEL_NAME = "openai-community/roberta-base-openai-detector"
# Device the classifier is moved to after loading.
DEVICE = "cpu"

# Tokenizer truncation length and number of chunks scored per forward pass.
MAX_LENGTH = 256
BATCH_SIZE = 8

# Probabilities at or above this value are labelled as AI-written; the
# confidence buckets are measured as a distance from it.
AI_THRESHOLD = 0.5

# File suffixes accepted by the document loader.
SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}
|
|
| |
| |
| |
# Load the tokenizer and the classifier once, at import time, so every
# request reuses the same objects. The model is placed on DEVICE and put
# in evaluation mode before first use.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE).eval()
|
|
|
|
| |
| |
| |
def load_text_from_file(file_path: str) -> str:
    """Extract the plain text of a ``.txt``, ``.pdf`` or ``.docx`` file.

    Args:
        file_path: Location of the document on disk.

    Returns:
        The document text. PDF pages are joined with newlines; DOCX
        paragraphs are joined with newlines and blank paragraphs are skipped.

    Raises:
        ValueError: If the file suffix (compared case-insensitively) is not
            in ``SUPPORTED_EXTENSIONS``.
    """
    path = Path(file_path)
    # Normalise the suffix once so that ``REPORT.PDF`` is dispatched exactly
    # like ``report.pdf``. Comparing the raw suffix in the branches below let
    # upper-case names pass validation and then fall through, returning None.
    suffix = path.suffix.lower()

    if suffix not in SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {path.suffix}")

    if suffix == ".txt":
        # Undecodable bytes are dropped rather than aborting the analysis.
        return path.read_text(encoding="utf-8", errors="ignore")

    if suffix == ".pdf":
        with fitz.open(path) as pdf:
            return "\n".join(page.get_text() for page in pdf)

    if suffix == ".docx":
        document = docx.Document(str(path))
        return "\n".join(p.text for p in document.paragraphs if p.text.strip())

    # Only reachable if SUPPORTED_EXTENSIONS gains a suffix without a loader
    # branch above; fail loudly instead of returning None.
    raise ValueError(f"Unsupported file type: {path.suffix}")
|
|
|
|
| |
| |
| |
def chunk_text(text: str, max_words: int = 200, min_words: int = 20) -> List[str]:
    """Split *text* into consecutive chunks of whitespace-separated words.

    Args:
        text: The text to split. Any run of whitespace separates words.
        max_words: Maximum number of words per chunk.
        min_words: Chunks with fewer words than this are discarded. In
            practice this drops a short trailing remainder, and yields an
            empty result for inputs shorter than ``min_words``.

    Returns:
        The kept chunks in document order, each with its words re-joined by
        single spaces.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    words = text.split()
    windows = (
        words[start:start + max_words]
        for start in range(0, len(words), max_words)
    )
    # The window length is already the word count, so there is no need to
    # join and split again just to measure it.
    return [" ".join(window) for window in windows if len(window) >= min_words]
|
|
|
|
| |
| |
| |
def calibrate_confidence(prob: float) -> str:
    """Bucket a probability by its distance from ``AI_THRESHOLD``.

    The distance is direction-agnostic: a probability far below the
    threshold is as "confident" as one far above it.

    Args:
        prob: Probability to grade.

    Returns:
        ``"High"`` for a distance of at least 0.35, ``"Medium"`` for at
        least 0.15, otherwise ``"Low"``.
    """
    margin = abs(prob - AI_THRESHOLD)
    # Bands are checked from widest to narrowest; the first one met wins.
    for floor, label in ((0.35, "High"), (0.15, "Medium")):
        if margin >= floor:
            return label
    return "Low"
|
|
|
|
| |
| |
| |
def _ai_class_index() -> int:
    """Return the logit column that stands for AI-generated text.

    The column is looked up by label name in the loaded model's config
    instead of being hard-coded, because checkpoints differ in class order.
    If the config carries no recognisable label name, column 1 is used.
    """
    ai_label_names = ("fake", "ai", "ai-generated", "generated", "machine")
    id2label = getattr(model.config, "id2label", None) or {}
    for index, label in id2label.items():
        if str(label).strip().lower() in ai_label_names:
            return int(index)
    return 1


@torch.no_grad()
def detect_ai_probability(texts: List[str], progress=gr.Progress()):
    """Score each text with the classifier and return its AI probability.

    Texts are tokenized and scored in batches of ``BATCH_SIZE``; each text is
    truncated to ``MAX_LENGTH`` tokens.

    Args:
        texts: Texts to score.
        progress: Gradio progress tracker, updated before every batch and
            once more when all batches are done.

    Returns:
        One float in ``[0, 1]`` per input text, in input order: the softmax
        probability of the class that represents AI-generated text.
    """
    total = len(texts)
    if total == 0:
        return []

    # NOTE(review): the configured checkpoint is expected to name its classes
    # "Fake" (index 0) and "Real" (index 1), so column 1 would be the *human*
    # probability. Resolve the column by name rather than assuming an order.
    ai_index = _ai_class_index()

    probabilities = []
    for start in range(0, total, BATCH_SIZE):
        progress((start, total))
        batch = texts[start:start + BATCH_SIZE]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
        )
        # Keep the inputs on the same device as the model so that changing
        # DEVICE does not cause a device-mismatch error.
        inputs = inputs.to(DEVICE)

        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1)[:, ai_index]
        probabilities.extend(probs.tolist())

    progress((total, total))
    return probabilities
|
|
|
|
| |
| |
| |
def classify_chunks(chunks: List[str], progress=gr.Progress()) -> pd.DataFrame:
    """Score every chunk and tabulate the per-chunk verdicts.

    Args:
        chunks: Text chunks to classify.
        progress: Gradio progress tracker forwarded to the scorer.

    Returns:
        A DataFrame with one row per chunk and the columns ``Text Chunk``,
        ``AI Probability (%)`` (rounded to two decimals), ``Prediction`` and
        ``Confidence``.
    """
    scores = detect_ai_probability(chunks, progress)

    percentages = []
    verdicts = []
    confidences = []
    for score in scores:
        percentages.append(round(score * 100, 2))
        verdicts.append("🤖 Likely AI" if score >= AI_THRESHOLD else "🧍 Human")
        confidences.append(calibrate_confidence(score))

    return pd.DataFrame({
        "Text Chunk": chunks,
        "AI Probability (%)": percentages,
        "Prediction": verdicts,
        "Confidence": confidences,
    })
|
|
|
|
def document_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Append a document-level verdict row to the per-chunk results.

    The "High" confidence label only says that a chunk's probability is far
    from the decision threshold; it holds for confidently human chunks as
    well as confidently AI ones. The verdict therefore counts high-confidence
    chunks per direction instead of lumping them together, which previously
    labelled confidently human documents as AI.

    Args:
        df: Per-chunk results with the columns ``Text Chunk``,
            ``AI Probability (%)``, ``Prediction`` and ``Confidence``.

    Returns:
        A new DataFrame: the rows of ``df`` followed by one summary row whose
        ``Text Chunk`` is ``"📄 Document Summary"``. Its probability is the
        mean chunk probability rounded to two decimals. Its prediction is
        ``"🤖 Likely AI"`` when at least 60% of the chunks are high-confidence
        AI, else ``"🧍 Human"``. Its confidence is ``"High"`` when at least
        60% of the chunks are high-confidence in the same direction, else
        ``"Medium"``.
    """
    total = len(df)
    majority = total * 0.6

    is_high = df["Confidence"] == "High"
    is_ai = df["Prediction"] == "🤖 Likely AI"
    ai_votes = int((is_high & is_ai).sum())
    human_votes = int((is_high & ~is_ai).sum())

    # With zero rows the majority is 0 and every ">=" test would pass, so an
    # empty table must never be reported as AI or as decisive.
    likely_ai = total > 0 and ai_votes >= majority
    decisive = total > 0 and max(ai_votes, human_votes) >= majority

    summary = pd.DataFrame([{
        "Text Chunk": "📄 Document Summary",
        "AI Probability (%)": round(df["AI Probability (%)"].mean(), 2),
        "Prediction": "🤖 Likely AI" if likely_ai else "🧍 Human",
        "Confidence": "High" if decisive else "Medium",
    }])

    return pd.concat([df, summary], ignore_index=True)
|
|
|
|
| |
| |
| |
def generate_gauge(prob_percent: float, prediction: str) -> str:
    """Render a semicircular gauge for a probability and save it as a PNG.

    The dial runs from 0% on the left to 100% on the right and is coloured
    green below 40, orange below 70 and red from 70 upwards. A needle marks
    ``prob_percent``; the percentage and the prediction label are printed
    under the dial.

    Args:
        prob_percent: Probability in percent. The needle is clamped to the
            0-100 range of the dial; the printed number is not.
        prediction: Label text shown under the percentage.

    Returns:
        Path of the PNG file. The file is created with ``delete=False`` and
        is not removed by this function.
    """
    # Reserve the output path first and close the handle before matplotlib
    # writes to it: reopening a still-open temporary file fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        path = tmp.name

    fig, ax = plt.subplots(figsize=(6, 3))
    try:
        angles = np.linspace(np.pi, 0, 100)

        # Faint full arc drawn underneath the coloured segments.
        ax.plot(np.cos(angles), np.sin(angles), linewidth=20, alpha=0.15)

        # 99 short segments between the 100 arc points, coloured by the
        # percentage at which each segment starts.
        for i, val in enumerate(np.linspace(0, 100, 99)):
            if val < 40:
                color = "green"
            elif val < 70:
                color = "orange"
            else:
                color = "red"

            ax.plot(
                np.cos(angles[i:i + 2]),
                np.sin(angles[i:i + 2]),
                linewidth=20,
                color=color,
            )

        # Keep the needle on the dial even for out-of-range input.
        needle_percent = min(max(prob_percent, 0.0), 100.0)
        needle_angle = np.pi * (1 - needle_percent / 100)
        ax.plot(
            [0, 0.8 * np.cos(needle_angle)],
            [0, 0.8 * np.sin(needle_angle)],
            linewidth=4,
        )

        ax.text(0, -0.1, f"{prob_percent:.0f}%", ha="center", va="center", fontsize=24, weight="bold")
        ax.text(0, -0.32, prediction, ha="center", va="center", fontsize=12)

        ax.set_aspect("equal")
        ax.axis("off")

        fig.savefig(path, bbox_inches="tight", dpi=150)
    finally:
        # Always release the figure, even when drawing or saving fails;
        # pyplot otherwise keeps it alive for the life of the process.
        plt.close(fig)

    return path
|
|
|
|
| |
| |
| |
def run_detector(text_input: str, uploaded_files, progress=gr.Progress()):
    """Analyse pasted text and/or uploaded documents.

    This is the handler wired to the Analyze button.

    Args:
        text_input: Text pasted by the user; may be empty or ``None``.
        uploaded_files: Uploaded files, or ``None`` when nothing was
            uploaded. Each item may be a path string or an object exposing
            the path as ``.name``.
        progress: Gradio progress tracker forwarded to the classifier.

    Returns:
        A ``(table, gauge_path)`` tuple. On success ``table`` holds the
        per-chunk results plus the summary row, and ``gauge_path`` is the
        gauge image for that summary. On failure ``table`` has a single
        ``Error`` column and ``gauge_path`` is ``None``.
    """
    texts = []

    pasted = (text_input or "").strip()
    if pasted:
        texts.append(pasted)

    for file in uploaded_files or []:
        # Depending on the Gradio version an upload arrives either as a
        # temp-file wrapper with a ``.name`` attribute or as a plain path.
        file_path = getattr(file, "name", file)
        try:
            content = load_text_from_file(file_path)
        except Exception as exc:
            # UI boundary: report unreadable or unsupported files in the
            # results table, like the other failure modes of this handler.
            label = Path(str(file_path)).name
            return pd.DataFrame({"Error": [f"Could not read {label}: {exc}"]}), None
        # Guard against a loader that yields no text at all.
        texts.append(content or "")

    if not texts:
        return pd.DataFrame({"Error": ["No input provided"]}), None

    chunks = []
    for text in texts:
        chunks.extend(chunk_text(text))

    if not chunks:
        return pd.DataFrame({"Error": ["Text too short for analysis"]}), None

    final_df = document_summary(classify_chunks(chunks, progress))

    # document_summary appends the summary as the last row; taking it by
    # position cannot be confused with a chunk whose text matches the label.
    summary_row = final_df.iloc[-1]
    gauge_path = generate_gauge(
        summary_row["AI Probability (%)"],
        summary_row["Prediction"],
    )

    return final_df, gauge_path
|
|
|
|
| |
| |
| |
# UI definition: header, two input widgets, the trigger button and two
# output widgets, laid out top to bottom in creation order.
with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
    # Header and short description.
    gr.Markdown("## 🧪 Offline AI Document Detector")
    gr.Markdown(
        "Detect whether content is AI-generated using an **offline, open-source model**. "
        "Supports **PDF, DOCX, TXT, and pasted text**. Optimized for **CPU-only Hugging Face Spaces**."
    )

    # Inputs: free text and/or any number of documents.
    text_input = gr.Textbox(label="✍️ Paste Text (optional)", lines=6)
    file_input = gr.File(
        file_count="multiple",
        file_types=[".pdf", ".docx", ".txt"],
        label="📂 Upload Documents",
    )

    # Trigger and outputs.
    analyze_btn = gr.Button("🔍 Analyze")
    output_table = gr.Dataframe(label="📊 Detection Results")
    gauge_plot = gr.Image(label="🧠 AI Probability Gauge")

    # run_detector returns (table, gauge image path), matching the outputs.
    analyze_btn.click(
        fn=run_detector,
        inputs=[text_input, file_input],
        outputs=[output_table, gauge_plot],
    )
|
|
if __name__ == "__main__":
    # The click handler reports progress through gr.Progress, which older
    # Gradio releases only allow when the request queue is enabled. Enabling
    # it explicitly is harmless on releases where it is already the default.
    app.queue().launch()
|
|