import gradio as gr
import re
import nltk
from nltk.corpus import wordnet
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from textstat import textstat
import spacy
import torch

# Download necessary NLTK data
try:
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)  # needed by newer NLTK releases for sent_tokenize
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception:
    print("NLTK data download failed. Some features may be limited.")

# Load NER model for entity detection
try:
    nlp = spacy.load("en_core_web_sm")
except Exception:
    try:
        spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
    except Exception:
        print("spaCy model loading failed. Entity recognition will be limited.")
        nlp = None

# Load sentiment analysis pipeline
try:
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
except Exception:
    print("Sentiment analyzer loading failed. Sentiment analysis will be disabled.")
    sentiment_analyzer = None

# Load grammar correction model
try:
    grammar_model_name = "pszemraj/flan-t5-large-grammar-synthesis"
    grammar_tokenizer = AutoTokenizer.from_pretrained(grammar_model_name)
    grammar_model = AutoModelForSeq2SeqLM.from_pretrained(grammar_model_name)
except Exception:
    print("Grammar correction model loading failed. Grammar correction will be disabled.")
    grammar_model = None
    grammar_tokenizer = None

# Load text summarization model
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
except Exception:
    print("Summarization model loading failed. Summarization will be disabled.")
    summarizer = None


def get_synonyms(word):
    """Get synonyms for a word using WordNet"""
    synonyms = set()
    try:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().replace('_', ' '))
        return list(synonyms)[:5]  # Return up to 5 synonyms
    except Exception:
        return []


def correct_grammar_with_model(text, max_length=512):
    """Use a transformer model to correct grammar"""
    if not grammar_model or not grammar_tokenizer:
        return text

    # Split text into sentence chunks so each stays within the model's input limit
    chunks = []
    sentences = nltk.sent_tokenize(text)
    current_chunk = ""

    for sentence in sentences:
        # If adding this sentence would make the chunk too long, save the current chunk and start a new one
        if len(grammar_tokenizer.encode(current_chunk + " " + sentence)) > max_length:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
        else:
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence

    # Add the last chunk if not empty
    if current_chunk:
        chunks.append(current_chunk)

    # Correct each chunk and reassemble the text
    corrected_chunks = []
    for chunk in chunks:
        # Skip empty chunks
        if not chunk.strip():
            continue
        inputs = grammar_tokenizer(f"grammar: {chunk}", return_tensors="pt",
                                   truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = grammar_model.generate(
                inputs.input_ids,
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )
        corrected = grammar_tokenizer.decode(outputs[0], skip_special_tokens=True)
        corrected_chunks.append(corrected)

    return " ".join(corrected_chunks)


def find_grammar_issues(original_text, corrected_text):
    """Identify differences between original and corrected text"""
    issues = []

    # Use simple tokenization to compare texts
    original_sentences = nltk.sent_tokenize(original_text)
    corrected_sentences = nltk.sent_tokenize(corrected_text)

    # Match up sentences and find differences
    min_len = min(len(original_sentences), len(corrected_sentences))
    for i in range(min_len):
        if original_sentences[i] != corrected_sentences[i]:
issues.append({ "original": original_sentences[i], "corrected": corrected_sentences[i], "position": original_text.find(original_sentences[i]) }) return issues def calculate_readability_metrics(text): """Calculate various readability metrics""" if not text.strip(): return {} try: return { "flesch_reading_ease": textstat.flesch_reading_ease(text), "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text), "gunning_fog": textstat.gunning_fog(text), "smog_index": textstat.smog_index(text), "automated_readability_index": textstat.automated_readability_index(text), "coleman_liau_index": textstat.coleman_liau_index(text), "reading_time": f"{textstat.reading_time(text, ms_per_char=14):.1f} seconds" } except: return {"error": "Readability calculation failed"} def find_repeated_words(text): """Find repeated words in close proximity""" words = text.lower().split() repeated = [] for i in range(len(words) - 5): window = words[i:i+5] for word in set(window): if len(word) > 3 and window.count(word) > 1: # Only consider words longer than 3 chars repeated.append(word) return list(set(repeated)) def identify_passive_voice(text): """Identify potential passive voice usage""" # Simple pattern matching for common passive voice constructions passive_patterns = [ r'\b(?:am|is|are|was|were|be|being|been)\s+(\w+ed)\b', r'\b(?:am|is|are|was|were|be|being|been)\s+(\w+en)\b' ] passive_instances = [] for pattern in passive_patterns: matches = re.finditer(pattern, text, re.IGNORECASE) for match in matches: start = max(0, match.start() - 20) end = min(len(text), match.end() + 20) context = text[start:end] passive_instances.append({ "match": match.group(0), "context": context, "position": match.start() }) return passive_instances def analyze_sentiment(text): """Analyze sentiment of the text""" if not sentiment_analyzer or len(text.strip()) < 5: # Skip very short text return {"label": "N/A", "score": 0} try: result = sentiment_analyzer(text[:512])[0] # Limit text length for the model return result except: return {"label": "Error", "score": 0} def extract_entities(text): """Extract named entities from text""" if not nlp: return [] entities = [] try: # Process text in chunks if it's too long max_chars = 100000 # spaCy default max length if len(text) > max_chars: chunks = [text[i:i+max_chars] for i in range(0, len(text), max_chars)] else: chunks = [text] for chunk in chunks: doc = nlp(chunk) for ent in doc.ents: entities.append({ "text": ent.text, "label": ent.label_, "start": ent.start_char, "end": ent.end_char }) except: pass return entities def suggest_simpler_vocabulary(text): """Suggest simpler alternatives for complex words""" # This is a simplified implementation complex_words = { "utilize": "use", "implement": "use", "facilitate": "help", "leverage": "use", "optimize": "improve", "commence": "start", "terminate": "end", "endeavor": "try", "cognizant": "aware", "prioritize": "focus on", "ascertain": "find out", "subsequent": "later", "initiate": "start", "finalize": "finish", "abundant": "many", "adequate": "enough", "demonstrate": "show", "encounter": "meet", "generate": "create", "observe": "see", "obtain": "get", "require": "need", "sufficient": "enough", "utilize": "use", "endeavour": "try", "comprehend": "understand", "procure": "get", "inquire": "ask", "commence": "begin", "purchase": "buy", "assist": "help" } suggestions = {} for word, replacement in complex_words.items(): if re.search(r'\b' + word + r'\b', text, re.IGNORECASE): suggestions[word] = replacement return suggestions def summarize_text(text, 
                   max_length=150, min_length=40):
    """Summarize the text using a pre-trained model"""
    if not summarizer or len(text.split()) < 30:  # Don't summarize short text
        return "Text is too short for summarization"

    try:
        # Split into chunks if text is too long
        max_chunk_length = 1024  # Most summarization models have limits
        if len(text.split()) > max_chunk_length:
            sentences = nltk.sent_tokenize(text)
            chunks = []
            current_chunk = []
            current_length = 0

            for sentence in sentences:
                sentence_length = len(sentence.split())
                if current_length + sentence_length <= max_chunk_length:
                    current_chunk.append(sentence)
                    current_length += sentence_length
                else:
                    chunks.append(" ".join(current_chunk))
                    current_chunk = [sentence]
                    current_length = sentence_length

            if current_chunk:
                chunks.append(" ".join(current_chunk))

            # Summarize each chunk and combine
            summaries = []
            for chunk in chunks:
                summary = summarizer(chunk,
                                     max_length=min(max_length, len(chunk.split())),
                                     min_length=min(min_length, len(chunk.split()) // 2),
                                     do_sample=False)[0]['summary_text']
                summaries.append(summary)
            return " ".join(summaries)
        else:
            return summarizer(text, max_length=max_length, min_length=min_length,
                              do_sample=False)[0]['summary_text']
    except Exception as e:
        return f"Summarization failed: {str(e)}"


def analyze_formality(text):
    """Analyze the formality level of the text"""
    # Simple heuristics-based formality analysis
    formal_indicators = [
        r'\b(?:however|therefore|thus|consequently|furthermore|moreover|nevertheless)\b',
        r'\b(?:shall|ought|whom|whereby|herein|therein|wherein)\b',
        r'\b(?:Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.)\b',
        r'\b(?:would like to|I am writing to)\b'
    ]

    informal_indicators = [
        r'\b(?:yeah|nope|gonna|wanna|gotta|kinda|sorta)\b',
        r'(?:!{2,}|\?{2,})',
        r'\b(?:lol|omg|btw|imo|tbh)\b',
        r"(?:don't|won't|can't|shouldn't|wouldn't|isn't|aren't|haven't)",
        r'\b(?:awesome|cool|super|great|huge)\b'
    ]

    formal_count = 0
    for pattern in formal_indicators:
        formal_count += len(re.findall(pattern, text, re.IGNORECASE))

    informal_count = 0
    for pattern in informal_indicators:
        informal_count += len(re.findall(pattern, text, re.IGNORECASE))

    # Calculate formality score (simple version)
    word_count = len(text.split())
    if word_count == 0:
        return {"formality_level": "Unknown", "score": 0.5}

    formal_ratio = formal_count / max(1, word_count)
    informal_ratio = informal_count / max(1, word_count)

    # Determine formality level
    if formal_ratio > 0.05 and formal_ratio > informal_ratio * 2:
        formality = "Formal"
        score = min(0.9, 0.5 + formal_ratio * 5)
    elif informal_ratio > 0.05 and informal_ratio > formal_ratio * 2:
        formality = "Informal"
        score = max(0.1, 0.5 - informal_ratio * 5)
    else:
        formality = "Neutral"
        score = 0.5

    return {"formality_level": formality, "score": score}


def detect_tone(text):
    """Detect the overall tone of the text"""
    if not text.strip():
        return "Neutral"

    # Simple keyword-based tone detection
    tone_keywords = {
        "Professional": ["recommend", "inform", "request", "provide", "consider", "suggest",
                         "propose", "analyze", "evaluate", "conclude"],
        "Academic": ["research", "study", "analysis", "theory", "hypothesis", "methodology",
                     "findings", "literature", "experiment", "data"],
        "Friendly": ["thanks", "appreciate", "happy", "glad", "hope", "welcome", "please",
                     "enjoy", "share", "connect"],
        "Persuasive": ["should", "must", "need", "important", "crucial", "essential",
                       "significant", "consider", "believe", "argue"],
        "Urgent": ["immediately", "urgent", "asap", "quickly", "soon", "deadline", "critical",
                   "emergency", "promptly", "hurry"],
        "Cautious": ["perhaps", "might", "may",
"possible", "potentially", "suggest", "consider", "could", "seems", "appears"] } tone_scores = {tone: 0 for tone in tone_keywords} word_count = len(text.split()) # Count occurrences of tone keywords for tone, keywords in tone_keywords.items(): for keyword in keywords: tone_scores[tone] += len(re.findall(r'\b' + keyword + r'\b', text, re.IGNORECASE)) # Normalize by word count for tone in tone_scores: tone_scores[tone] = tone_scores[tone] / max(1, word_count) # Find the most dominant tone dominant_tone = max(tone_scores.items(), key=lambda x: x[1]) # Only return a specific tone if it's significantly present if dominant_tone[1] > 0.02: return dominant_tone[0] else: return "Neutral" def text_analysis(text): """Comprehensive text analysis""" if not text.strip(): return { "grammar_issues": [], "corrected_text": "", "readability": {}, "repeated_words": [], "passive_voice": [], "sentiment": {"label": "N/A", "score": 0}, "entities": [], "simpler_vocabulary": {}, "formality": {"formality_level": "Unknown", "score": 0.5}, "tone": "Neutral", "summary": "", "word_count": 0, "sentence_count": 0, "average_sentence_length": 0 } # Basic text stats word_count = len(text.split()) sentences = nltk.sent_tokenize(text) sentence_count = len(sentences) avg_sentence_length = word_count / max(sentence_count, 1) # Correct grammar with AI model corrected_text = correct_grammar_with_model(text) # Find grammar issues by comparing original and corrected text grammar_issues = find_grammar_issues(text, corrected_text) # Run all analysis functions readability = calculate_readability_metrics(text) repeated_words = find_repeated_words(text) passive_voice = identify_passive_voice(text) sentiment = analyze_sentiment(text) entities = extract_entities(text) simpler_words = suggest_simpler_vocabulary(text) formality = analyze_formality(text) tone = detect_tone(text) # Generate summary for longer text summary = "" if word_count > 50: summary = summarize_text(text) return { "grammar_issues": grammar_issues, "corrected_text": corrected_text, "readability": readability, "repeated_words": repeated_words, "passive_voice": passive_voice, "sentiment": sentiment, "entities": entities, "simpler_vocabulary": simpler_words, "formality": formality, "tone": tone, "summary": summary, "word_count": word_count, "sentence_count": sentence_count, "average_sentence_length": avg_sentence_length } def format_grammar_issues(issues): if not issues: return "No grammar issues found." result = "Grammar Issues Found:\n\n" for i, issue in enumerate(issues, 1): result += f"{i}. Original: \"{issue['original']}\"\n" result += f" Corrected: \"{issue['corrected']}\"\n\n" return result def format_readability(metrics): if not metrics: return "Readability metrics not available." 
if "error" in metrics: return f"Error: {metrics['error']}" # Define interpretations for Flesch Reading Ease def interpret_flesch(score): if score >= 90: return "Very Easy (5th grade)" elif score >= 80: return "Easy (6th grade)" elif score >= 70: return "Fairly Easy (7th grade)" elif score >= 60: return "Standard (8th-9th grade)" elif score >= 50: return "Fairly Difficult (10th-12th grade)" elif score >= 30: return "Difficult (College)" else: return "Very Difficult (College Graduate)" result = "Readability Analysis:\n\n" result += f"• Flesch Reading Ease: {metrics['flesch_reading_ease']:.1f} - {interpret_flesch(metrics['flesch_reading_ease'])}\n" result += f"• Flesch-Kincaid Grade Level: {metrics['flesch_kincaid_grade']:.1f}\n" result += f"• Gunning Fog Index: {metrics['gunning_fog']:.1f}\n" result += f"• SMOG Index: {metrics['smog_index']:.1f}\n" result += f"• Automated Readability Index: {metrics['automated_readability_index']:.1f}\n" result += f"• Coleman-Liau Index: {metrics['coleman_liau_index']:.1f}\n" result += f"• Estimated Reading Time: {metrics['reading_time']}" return result def format_passive_voice(passive_instances): if not passive_instances: return "No passive voice detected." result = f"Passive Voice Detected ({len(passive_instances)} instances):\n\n" for i, instance in enumerate(passive_instances[:5], 1): # Show up to 5 examples result += f"{i}. \"...{instance['context']}...\"\n" if len(passive_instances) > 5: result += f"\nand {len(passive_instances) - 5} more..." return result def format_entities(entities): if not entities: return "No named entities detected." # Group entities by type entity_groups = {} for entity in entities: if entity['label'] not in entity_groups: entity_groups[entity['label']] = [] entity_groups[entity['label']].append(entity['text']) result = "Named Entities Detected:\n\n" for label, items in entity_groups.items(): unique_items = list(set(items))[:5] # Show up to 5 unique entities per type result += f"• {label}: {', '.join(unique_items)}" if len(set(items)) > 5: result += f" and {len(set(items)) - 5} more" result += "\n" return result def format_vocabulary_suggestions(suggestions): if not suggestions: return "No vocabulary simplification suggestions." 
result = "Vocabulary Simplification Suggestions:\n\n" for complex_word, simple_word in suggestions.items(): result += f"• \"{complex_word}\" → \"{simple_word}\"\n" return result def build_interface(): with gr.Blocks(title="AI Grammar & Style Assistant", theme=gr.themes.Soft()) as app: gr.Markdown("# 📝 AI Grammar & Style Assistant") gr.Markdown("Powered by AI to help improve your writing with advanced grammar checking, style suggestions, and more!") with gr.Tab("Text Analysis"): with gr.Row(): with gr.Column(scale=3): input_text = gr.Textbox( label="Enter your text here", placeholder="Type or paste your text here for analysis...", lines=10 ) analyze_btn = gr.Button("Analyze Text", variant="primary") with gr.Column(scale=3): corrected_output = gr.Textbox(label="Corrected Text", lines=10) with gr.Row(): with gr.Column(): grammar_issues = gr.Textbox(label="Grammar Issues", lines=6) readability_metrics = gr.Textbox(label="Readability Analysis", lines=10) with gr.Column(): passive_voice_output = gr.Textbox(label="Passive Voice Detection", lines=6) vocab_suggestions = gr.Textbox(label="Vocabulary Suggestions", lines=6) with gr.Row(): with gr.Column(): entity_detection = gr.Textbox(label="Entity Detection", lines=6) with gr.Column(): formality_tone = gr.Textbox(label="Formality & Tone Analysis", lines=6) with gr.Row(): text_summary = gr.Textbox(label="Text Summary", lines=4) with gr.Row(): text_stats = gr.JSON(label="Text Statistics") with gr.Tab("Help & Information"): gr.Markdown(""" ## How to Use This Tool 1. Enter or paste your text in the input box 2. Click "Analyze Text" 3. Review the analysis results across all categories ## Features - **Grammar Correction**: AI-powered grammar correction using advanced language models - **Readability Analysis**: Multiple readability metrics including Flesch Reading Ease, Gunning Fog, and more - **Style Improvement**: Detects passive voice, repeated words, and complex vocabulary - **Named Entity Recognition**: Identifies people, organizations, locations, and more - **Sentiment Analysis**: Detects the emotional tone of your text - **Formality Analysis**: Determines if your text is formal, neutral, or informal - **Text Summarization**: Creates a concise summary of longer texts - **Tone Detection**: Identifies the overall tone (professional, academic, friendly, etc.) ## About This is an AI-powered writing assistant similar to Grammarly, built with Python, Gradio, and Hugging Face transformer models. 
""") def process_text(text): """Process the input text and return all analysis results""" if not text.strip(): return ("", "No text to analyze.", "No text to analyze.", "No text to analyze.", "No text to analyze.", "No text to analyze.", "No text to analyze.", "No text to analyze.", {}) # Perform comprehensive analysis results = text_analysis(text) # Format corrected text corrected = results["corrected_text"] if results["corrected_text"] else text # Format grammar issues grammar_output = format_grammar_issues(results["grammar_issues"]) # Format readability metrics readability_output = format_readability(results["readability"]) # Format passive voice detection passive_output = format_passive_voice(results["passive_voice"]) # Format entity detection entities_output = format_entities(results["entities"]) # Format vocabulary suggestions vocab_output = format_vocabulary_suggestions(results["simpler_vocabulary"]) # Format formality and tone formality_tone_output = f"Formality: {results['formality']['formality_level']} (Score: {results['formality']['score']:.2f})\nTone: {results['tone']}" # Format summary summary_output = results["summary"] if results["summary"] else "Summary not available for this text." # Format text statistics stats = { "Word Count": results["word_count"], "Sentence Count": results["sentence_count"], "Average Sentence Length": f"{results['average_sentence_length']:.1f} words", "Repeated Words": results["repeated_words"], "Sentiment": f"{results['sentiment']['label']} (Score: {results['sentiment']['score']:.2f})" } return (corrected, grammar_output, readability_output, passive_output, vocab_output, entities_output, formality_tone_output, summary_output, stats) analyze_btn.click( process_text, inputs=[input_text], outputs=[corrected_output, grammar_issues, readability_metrics, passive_voice_output, vocab_suggestions, entity_detection, formality_tone, text_summary, text_stats] ) return app # Create and launch the interface app = build_interface() # For Hugging Face Spaces deployment if __name__ == "__main__": app.launch()