# Grammerly / app.py — AI Grammar & Style Assistant (Gradio app)
import gradio as gr
import re
import nltk
from nltk.corpus import wordnet
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from textstat import textstat
import spacy
import torch
# Download necessary NLTK data
try:
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception:
print("NLTK data download failed. Some features may be limited.")
# Load NER model for entity detection
try:
nlp = spacy.load("en_core_web_sm")
except Exception:
try:
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
    except Exception:
print("Spacy model loading failed. Entity recognition will be limited.")
nlp = None
# Load sentiment analysis pipeline
try:
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
except Exception:
print("Sentiment analyzer loading failed. Sentiment analysis will be disabled.")
sentiment_analyzer = None
# Load grammar correction model
try:
grammar_model_name = "pszemraj/flan-t5-large-grammar-synthesis"
grammar_tokenizer = AutoTokenizer.from_pretrained(grammar_model_name)
grammar_model = AutoModelForSeq2SeqLM.from_pretrained(grammar_model_name)
except Exception:
print("Grammar correction model loading failed. Will use alternative methods.")
grammar_model = None
grammar_tokenizer = None
# Load text summarization model
try:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
except Exception:
print("Summarization model loading failed. Summarization will be disabled.")
summarizer = None
def get_synonyms(word):
"""Get synonyms for a word using WordNet"""
synonyms = set()
try:
for syn in wordnet.synsets(word):
for lemma in syn.lemmas():
synonyms.add(lemma.name().replace('_', ' '))
return list(synonyms)[:5] # Return up to 5 synonyms
    except Exception:
return []
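# Illustrative usage (actual output depends on the installed WordNet data):
#   get_synonyms("happy")  # -> e.g. ['felicitous', 'glad', 'well-chosen', 'happy']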
def correct_grammar_with_model(text, max_length=512):
"""Use a transformer model to correct grammar"""
if not grammar_model or not grammar_tokenizer:
return text
# Split text into chunks if too long
chunks = []
sentences = nltk.sent_tokenize(text)
current_chunk = ""
for sentence in sentences:
# If adding this sentence would make the chunk too long, save current chunk and start a new one
if len(grammar_tokenizer.encode(current_chunk + " " + sentence)) > max_length:
chunks.append(current_chunk)
current_chunk = sentence
else:
if current_chunk:
current_chunk += " " + sentence
else:
current_chunk = sentence
# Add the last chunk if not empty
if current_chunk:
chunks.append(current_chunk)
# Process each chunk
corrected_chunks = []
for chunk in chunks:
# Skip empty chunks
if not chunk.strip():
continue
inputs = grammar_tokenizer(f"grammar: {chunk}", return_tensors="pt", truncation=True, max_length=max_length)
with torch.no_grad():
outputs = grammar_model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
max_length=max_length,
num_beams=5,
early_stopping=True
)
corrected = grammar_tokenizer.decode(outputs[0], skip_special_tokens=True)
corrected_chunks.append(corrected)
return " ".join(corrected_chunks)
def find_grammar_issues(original_text, corrected_text):
"""Identify differences between original and corrected text"""
issues = []
# Use simple tokenization to compare texts
original_sentences = nltk.sent_tokenize(original_text)
corrected_sentences = nltk.sent_tokenize(corrected_text)
# Match up sentences and find differences
min_len = min(len(original_sentences), len(corrected_sentences))
for i in range(min_len):
if original_sentences[i] != corrected_sentences[i]:
issues.append({
"original": original_sentences[i],
"corrected": corrected_sentences[i],
"position": original_text.find(original_sentences[i])
})
return issues
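# Illustrative usage:
#   find_grammar_issues("He go home.", "He goes home.")
#   # -> [{'original': 'He go home.', 'corrected': 'He goes home.', 'position': 0}]
# Note: the comparison is sentence-by-sentence, so it assumes the correction keeps
# the sentence count roughly unchanged.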
def calculate_readability_metrics(text):
"""Calculate various readability metrics"""
if not text.strip():
return {}
try:
return {
"flesch_reading_ease": textstat.flesch_reading_ease(text),
"flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
"gunning_fog": textstat.gunning_fog(text),
"smog_index": textstat.smog_index(text),
"automated_readability_index": textstat.automated_readability_index(text),
"coleman_liau_index": textstat.coleman_liau_index(text),
"reading_time": f"{textstat.reading_time(text, ms_per_char=14):.1f} seconds"
}
    except Exception:
return {"error": "Readability calculation failed"}
def find_repeated_words(text):
"""Find repeated words in close proximity"""
words = text.lower().split()
repeated = []
    for i in range(max(0, len(words) - 4)):  # slide a 5-word window over the text
window = words[i:i+5]
for word in set(window):
if len(word) > 3 and window.count(word) > 1: # Only consider words longer than 3 chars
repeated.append(word)
return list(set(repeated))
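# Illustrative usage (only words longer than 3 characters count):
#   find_repeated_words("The report covers the report findings in detail")
#   # -> ['report']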
def identify_passive_voice(text):
"""Identify potential passive voice usage"""
# Simple pattern matching for common passive voice constructions
passive_patterns = [
r'\b(?:am|is|are|was|were|be|being|been)\s+(\w+ed)\b',
r'\b(?:am|is|are|was|were|be|being|been)\s+(\w+en)\b'
]
passive_instances = []
for pattern in passive_patterns:
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
start = max(0, match.start() - 20)
end = min(len(text), match.end() + 20)
context = text[start:end]
passive_instances.append({
"match": match.group(0),
"context": context,
"position": match.start()
})
return passive_instances
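# Illustrative usage:
#   identify_passive_voice("The report was written by the team.")
#   # -> one instance matching "was written", with up to ~20 characters of context on each side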
def analyze_sentiment(text):
"""Analyze sentiment of the text"""
if not sentiment_analyzer or len(text.strip()) < 5: # Skip very short text
return {"label": "N/A", "score": 0}
try:
result = sentiment_analyzer(text[:512])[0] # Limit text length for the model
return result
    except Exception:
return {"label": "Error", "score": 0}
def extract_entities(text):
"""Extract named entities from text"""
if not nlp:
return []
entities = []
try:
# Process text in chunks if it's too long
        max_chars = 100000  # process in chunks, staying well below spaCy's default max_length of 1,000,000 characters
if len(text) > max_chars:
chunks = [text[i:i+max_chars] for i in range(0, len(text), max_chars)]
else:
chunks = [text]
for chunk in chunks:
doc = nlp(chunk)
for ent in doc.ents:
entities.append({
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char
})
    except Exception:
pass
return entities
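# Illustrative usage (labels come from the spaCy model, if it loaded):
#   extract_entities("Apple was founded by Steve Jobs in California.")
#   # -> might include entries labelled ORG, PERSON and GPE, with character offsets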
def suggest_simpler_vocabulary(text):
"""Suggest simpler alternatives for complex words"""
# This is a simplified implementation
complex_words = {
"utilize": "use",
"implement": "use",
"facilitate": "help",
"leverage": "use",
"optimize": "improve",
"commence": "start",
"terminate": "end",
"endeavor": "try",
"cognizant": "aware",
"prioritize": "focus on",
"ascertain": "find out",
"subsequent": "later",
"initiate": "start",
"finalize": "finish",
"abundant": "many",
"adequate": "enough",
"demonstrate": "show",
"encounter": "meet",
"generate": "create",
"observe": "see",
"obtain": "get",
"require": "need",
"sufficient": "enough",
"utilize": "use",
"endeavour": "try",
"comprehend": "understand",
"procure": "get",
"inquire": "ask",
"commence": "begin",
"purchase": "buy",
"assist": "help"
}
suggestions = {}
for word, replacement in complex_words.items():
if re.search(r'\b' + word + r'\b', text, re.IGNORECASE):
suggestions[word] = replacement
return suggestions
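# Illustrative usage:
#   suggest_simpler_vocabulary("We will utilize the data to facilitate planning.")
#   # -> {'utilize': 'use', 'facilitate': 'help'}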
def summarize_text(text, max_length=150, min_length=40):
"""Summarize the text using a pre-trained model"""
    if not summarizer:
        return "Summarization model not available"
    if len(text.split()) < 30:  # Don't summarize short text
        return "Text is too short for summarization"
try:
# Split into chunks if text is too long
max_chunk_length = 1024 # Most summarization models have limits
if len(text.split()) > max_chunk_length:
sentences = nltk.sent_tokenize(text)
chunks = []
current_chunk = []
current_length = 0
for sentence in sentences:
sentence_length = len(sentence.split())
if current_length + sentence_length <= max_chunk_length:
current_chunk.append(sentence)
current_length += sentence_length
else:
chunks.append(" ".join(current_chunk))
current_chunk = [sentence]
current_length = sentence_length
if current_chunk:
chunks.append(" ".join(current_chunk))
# Summarize each chunk and combine
summaries = []
for chunk in chunks:
summary = summarizer(chunk, max_length=min(max_length, len(chunk.split())),
min_length=min(min_length, len(chunk.split())//2),
do_sample=False)[0]['summary_text']
summaries.append(summary)
return " ".join(summaries)
else:
return summarizer(text, max_length=max_length, min_length=min_length,
do_sample=False)[0]['summary_text']
except Exception as e:
return f"Summarization failed: {str(e)}"
def analyze_formality(text):
"""Analyze the formality level of the text"""
# Simple heuristics-based formality analysis
formal_indicators = [
r'\b(?:however|therefore|thus|consequently|furthermore|moreover|nevertheless)\b',
r'\b(?:shall|ought|whom|whereby|herein|therein|wherein)\b',
        r'\b(?:Mr|Mrs|Ms|Dr|Prof)\.',
r'\b(?:would like to|I am writing to)\b'
]
informal_indicators = [
r'\b(?:yeah|nope|gonna|wanna|gotta|kinda|sorta)\b',
r'(?:!{2,}|\?{2,})',
r'\b(?:lol|omg|btw|imo|tbh)\b',
r"(?:don't|won't|can't|shouldn't|wouldn't|isn't|aren't|haven't)",
r'\b(?:awesome|cool|super|great|huge)\b'
]
formal_count = 0
for pattern in formal_indicators:
formal_count += len(re.findall(pattern, text, re.IGNORECASE))
informal_count = 0
for pattern in informal_indicators:
informal_count += len(re.findall(pattern, text, re.IGNORECASE))
# Calculate formality score (simple version)
word_count = len(text.split())
if word_count == 0:
return {"formality_level": "Unknown", "score": 0.5}
formal_ratio = formal_count / max(1, word_count)
informal_ratio = informal_count / max(1, word_count)
# Determine formality level
if formal_ratio > 0.05 and formal_ratio > informal_ratio * 2:
formality = "Formal"
score = min(0.9, 0.5 + formal_ratio * 5)
elif informal_ratio > 0.05 and informal_ratio > formal_ratio * 2:
formality = "Informal"
score = max(0.1, 0.5 - informal_ratio * 5)
else:
formality = "Neutral"
score = 0.5
return {"formality_level": formality, "score": score}
def detect_tone(text):
"""Detect the overall tone of the text"""
if not text.strip():
return "Neutral"
# Simple keyword-based tone detection
tone_keywords = {
"Professional": ["recommend", "inform", "request", "provide", "consider", "suggest", "propose", "analyze", "evaluate", "conclude"],
"Academic": ["research", "study", "analysis", "theory", "hypothesis", "methodology", "findings", "literature", "experiment", "data"],
"Friendly": ["thanks", "appreciate", "happy", "glad", "hope", "welcome", "please", "enjoy", "share", "connect"],
"Persuasive": ["should", "must", "need", "important", "crucial", "essential", "significant", "consider", "believe", "argue"],
"Urgent": ["immediately", "urgent", "asap", "quickly", "soon", "deadline", "critical", "emergency", "promptly", "hurry"],
"Cautious": ["perhaps", "might", "may", "possible", "potentially", "suggest", "consider", "could", "seems", "appears"]
}
tone_scores = {tone: 0 for tone in tone_keywords}
word_count = len(text.split())
# Count occurrences of tone keywords
for tone, keywords in tone_keywords.items():
for keyword in keywords:
tone_scores[tone] += len(re.findall(r'\b' + keyword + r'\b', text, re.IGNORECASE))
# Normalize by word count
for tone in tone_scores:
tone_scores[tone] = tone_scores[tone] / max(1, word_count)
# Find the most dominant tone
dominant_tone = max(tone_scores.items(), key=lambda x: x[1])
# Only return a specific tone if it's significantly present
if dominant_tone[1] > 0.02:
return dominant_tone[0]
else:
return "Neutral"
def text_analysis(text):
"""Comprehensive text analysis"""
if not text.strip():
return {
"grammar_issues": [],
"corrected_text": "",
"readability": {},
"repeated_words": [],
"passive_voice": [],
"sentiment": {"label": "N/A", "score": 0},
"entities": [],
"simpler_vocabulary": {},
"formality": {"formality_level": "Unknown", "score": 0.5},
"tone": "Neutral",
"summary": "",
"word_count": 0,
"sentence_count": 0,
"average_sentence_length": 0
}
# Basic text stats
word_count = len(text.split())
sentences = nltk.sent_tokenize(text)
sentence_count = len(sentences)
avg_sentence_length = word_count / max(sentence_count, 1)
# Correct grammar with AI model
corrected_text = correct_grammar_with_model(text)
# Find grammar issues by comparing original and corrected text
grammar_issues = find_grammar_issues(text, corrected_text)
# Run all analysis functions
readability = calculate_readability_metrics(text)
repeated_words = find_repeated_words(text)
passive_voice = identify_passive_voice(text)
sentiment = analyze_sentiment(text)
entities = extract_entities(text)
simpler_words = suggest_simpler_vocabulary(text)
formality = analyze_formality(text)
tone = detect_tone(text)
# Generate summary for longer text
summary = ""
if word_count > 50:
summary = summarize_text(text)
return {
"grammar_issues": grammar_issues,
"corrected_text": corrected_text,
"readability": readability,
"repeated_words": repeated_words,
"passive_voice": passive_voice,
"sentiment": sentiment,
"entities": entities,
"simpler_vocabulary": simpler_words,
"formality": formality,
"tone": tone,
"summary": summary,
"word_count": word_count,
"sentence_count": sentence_count,
"average_sentence_length": avg_sentence_length
}
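# Illustrative usage:
#   results = text_analysis("Paste any draft paragraph here ...")
#   results["corrected_text"], results["readability"], results["tone"]  # etc.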
def format_grammar_issues(issues):
if not issues:
return "No grammar issues found."
result = "Grammar Issues Found:\n\n"
for i, issue in enumerate(issues, 1):
result += f"{i}. Original: \"{issue['original']}\"\n"
result += f" Corrected: \"{issue['corrected']}\"\n\n"
return result
def format_readability(metrics):
if not metrics:
return "Readability metrics not available."
if "error" in metrics:
return f"Error: {metrics['error']}"
# Define interpretations for Flesch Reading Ease
def interpret_flesch(score):
if score >= 90: return "Very Easy (5th grade)"
elif score >= 80: return "Easy (6th grade)"
elif score >= 70: return "Fairly Easy (7th grade)"
elif score >= 60: return "Standard (8th-9th grade)"
elif score >= 50: return "Fairly Difficult (10th-12th grade)"
elif score >= 30: return "Difficult (College)"
else: return "Very Difficult (College Graduate)"
result = "Readability Analysis:\n\n"
result += f"• Flesch Reading Ease: {metrics['flesch_reading_ease']:.1f} - {interpret_flesch(metrics['flesch_reading_ease'])}\n"
result += f"• Flesch-Kincaid Grade Level: {metrics['flesch_kincaid_grade']:.1f}\n"
result += f"• Gunning Fog Index: {metrics['gunning_fog']:.1f}\n"
result += f"• SMOG Index: {metrics['smog_index']:.1f}\n"
result += f"• Automated Readability Index: {metrics['automated_readability_index']:.1f}\n"
result += f"• Coleman-Liau Index: {metrics['coleman_liau_index']:.1f}\n"
result += f"• Estimated Reading Time: {metrics['reading_time']}"
return result
def format_passive_voice(passive_instances):
if not passive_instances:
return "No passive voice detected."
result = f"Passive Voice Detected ({len(passive_instances)} instances):\n\n"
for i, instance in enumerate(passive_instances[:5], 1): # Show up to 5 examples
result += f"{i}. \"...{instance['context']}...\"\n"
if len(passive_instances) > 5:
result += f"\nand {len(passive_instances) - 5} more..."
return result
def format_entities(entities):
if not entities:
return "No named entities detected."
# Group entities by type
entity_groups = {}
for entity in entities:
if entity['label'] not in entity_groups:
entity_groups[entity['label']] = []
entity_groups[entity['label']].append(entity['text'])
result = "Named Entities Detected:\n\n"
for label, items in entity_groups.items():
unique_items = list(set(items))[:5] # Show up to 5 unique entities per type
result += f"• {label}: {', '.join(unique_items)}"
if len(set(items)) > 5:
result += f" and {len(set(items)) - 5} more"
result += "\n"
return result
def format_vocabulary_suggestions(suggestions):
if not suggestions:
return "No vocabulary simplification suggestions."
result = "Vocabulary Simplification Suggestions:\n\n"
for complex_word, simple_word in suggestions.items():
result += f"• \"{complex_word}\" → \"{simple_word}\"\n"
return result
def build_interface():
with gr.Blocks(title="AI Grammar & Style Assistant", theme=gr.themes.Soft()) as app:
gr.Markdown("# 📝 AI Grammar & Style Assistant")
gr.Markdown("Powered by AI to help improve your writing with advanced grammar checking, style suggestions, and more!")
with gr.Tab("Text Analysis"):
with gr.Row():
with gr.Column(scale=3):
input_text = gr.Textbox(
label="Enter your text here",
placeholder="Type or paste your text here for analysis...",
lines=10
)
analyze_btn = gr.Button("Analyze Text", variant="primary")
with gr.Column(scale=3):
corrected_output = gr.Textbox(label="Corrected Text", lines=10)
with gr.Row():
with gr.Column():
grammar_issues = gr.Textbox(label="Grammar Issues", lines=6)
readability_metrics = gr.Textbox(label="Readability Analysis", lines=10)
with gr.Column():
passive_voice_output = gr.Textbox(label="Passive Voice Detection", lines=6)
vocab_suggestions = gr.Textbox(label="Vocabulary Suggestions", lines=6)
with gr.Row():
with gr.Column():
entity_detection = gr.Textbox(label="Entity Detection", lines=6)
with gr.Column():
formality_tone = gr.Textbox(label="Formality & Tone Analysis", lines=6)
with gr.Row():
text_summary = gr.Textbox(label="Text Summary", lines=4)
with gr.Row():
text_stats = gr.JSON(label="Text Statistics")
with gr.Tab("Help & Information"):
gr.Markdown("""
## How to Use This Tool
1. Enter or paste your text in the input box
2. Click "Analyze Text"
3. Review the analysis results across all categories
## Features
- **Grammar Correction**: AI-powered grammar correction using advanced language models
- **Readability Analysis**: Multiple readability metrics including Flesch Reading Ease, Gunning Fog, and more
- **Style Improvement**: Detects passive voice, repeated words, and complex vocabulary
- **Named Entity Recognition**: Identifies people, organizations, locations, and more
- **Sentiment Analysis**: Detects the emotional tone of your text
- **Formality Analysis**: Determines if your text is formal, neutral, or informal
- **Text Summarization**: Creates a concise summary of longer texts
- **Tone Detection**: Identifies the overall tone (professional, academic, friendly, etc.)
## About
This is an AI-powered writing assistant similar to Grammarly, built with Python, Gradio, and Hugging Face transformer models.
""")
def process_text(text):
"""Process the input text and return all analysis results"""
if not text.strip():
return ("", "No text to analyze.", "No text to analyze.", "No text to analyze.",
"No text to analyze.", "No text to analyze.", "No text to analyze.", "No text to analyze.",
{})
# Perform comprehensive analysis
results = text_analysis(text)
# Format corrected text
corrected = results["corrected_text"] if results["corrected_text"] else text
# Format grammar issues
grammar_output = format_grammar_issues(results["grammar_issues"])
# Format readability metrics
readability_output = format_readability(results["readability"])
# Format passive voice detection
passive_output = format_passive_voice(results["passive_voice"])
# Format entity detection
entities_output = format_entities(results["entities"])
# Format vocabulary suggestions
vocab_output = format_vocabulary_suggestions(results["simpler_vocabulary"])
# Format formality and tone
formality_tone_output = f"Formality: {results['formality']['formality_level']} (Score: {results['formality']['score']:.2f})\nTone: {results['tone']}"
# Format summary
summary_output = results["summary"] if results["summary"] else "Summary not available for this text."
# Format text statistics
stats = {
"Word Count": results["word_count"],
"Sentence Count": results["sentence_count"],
"Average Sentence Length": f"{results['average_sentence_length']:.1f} words",
"Repeated Words": results["repeated_words"],
"Sentiment": f"{results['sentiment']['label']} (Score: {results['sentiment']['score']:.2f})"
}
return (corrected, grammar_output, readability_output, passive_output,
vocab_output, entities_output, formality_tone_output, summary_output, stats)
analyze_btn.click(
process_text,
inputs=[input_text],
outputs=[corrected_output, grammar_issues, readability_metrics,
passive_voice_output, vocab_suggestions, entity_detection,
formality_tone, text_summary, text_stats]
)
return app
# Create and launch the interface
app = build_interface()
# For Hugging Face Spaces deployment
if __name__ == "__main__":
app.launch()