# Grammerly / app.py — AI Grammar & Style Assistant (Gradio app)
import gradio as gr
import re
import nltk
from nltk.corpus import wordnet
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from textstat import textstat
import spacy
import torch
# Download necessary NLTK data
try:
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception:
print("NLTK data download failed. Some features may be limited.")
# Load NER model for entity detection
try:
nlp = spacy.load("en_core_web_sm")
except Exception:
try:
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
    except Exception:
print("Spacy model loading failed. Entity recognition will be limited.")
nlp = None
# Load sentiment analysis pipeline
try:
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
except Exception:
print("Sentiment analyzer loading failed. Sentiment analysis will be disabled.")
sentiment_analyzer = None
# Load grammar correction model
try:
grammar_model_name = "pszemraj/flan-t5-large-grammar-synthesis"
grammar_tokenizer = AutoTokenizer.from_pretrained(grammar_model_name)
grammar_model = AutoModelForSeq2SeqLM.from_pretrained(grammar_model_name)
except Exception:
print("Grammar correction model loading failed. Will use alternative methods.")
grammar_model = None
grammar_tokenizer = None
# Load text summarization model
try:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
except Exception:
print("Summarization model loading failed. Summarization will be disabled.")
summarizer = None
def get_synonyms(word):
"""Get synonyms for a word using WordNet"""
synonyms = set()
try:
for syn in wordnet.synsets(word):
for lemma in syn.lemmas():
synonyms.add(lemma.name().replace('_', ' '))
return list(synonyms)[:5] # Return up to 5 synonyms
    except Exception:
return []
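# Illustrative usage (actual output depends on the installed WordNet data):
#   get_synonyms("happy")  # -> e.g. ['felicitous', 'glad', 'well-chosen', 'happy']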
def correct_grammar_with_model(text, max_length=512):
"""Use a transformer model to correct grammar"""
if not grammar_model or not grammar_tokenizer:
return text
# Split text into chunks if too long
chunks = []
sentences = nltk.sent_tokenize(text)
current_chunk = ""
for sentence in sentences:
# If adding this sentence would make the chunk too long, save current chunk and start a new one
if len(grammar_tokenizer.encode(current_chunk + " " + sentence)) > max_length:
chunks.append(current_chunk)
current_chunk = sentence
else:
if current_chunk:
current_chunk += " " + sentence
else:
current_chunk = sentence
# Add the last chunk if not empty
if current_chunk:
chunks.append(current_chunk)
# Process each chunk
corrected_chunks = []
for chunk in chunks:
# Skip empty chunks
if not chunk.strip():
continue
inputs = grammar_tokenizer(f"grammar: {chunk}", return_tensors="pt", truncation=True, max_length=max_length)
with torch.no_grad():
outputs = grammar_model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
max_length=max_length,
num_beams=5,
early_stopping=True
)
corrected = grammar_tokenizer.decode(outputs[0], skip_special_tokens=True)
corrected_chunks.append(corrected)
return " ".join(corrected_chunks)
def find_grammar_issues(original_text, corrected_text):
"""Identify differences between original and corrected text"""
issues = []
# Use simple tokenization to compare texts
original_sentences = nltk.sent_tokenize(original_text)
corrected_sentences = nltk.sent_tokenize(corrected_text)
# Match up sentences and find differences
min_len = min(len(original_sentences), len(corrected_sentences))
for i in range(min_len):
if original_sentences[i] != corrected_sentences[i]:
issues.append({
"original": original_sentences[i],
"corrected": corrected_sentences[i],
"position": original_text.find(original_sentences[i])
})
return issues
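# Illustrative usage:
#   find_grammar_issues("He go home.", "He goes home.")
#   # -> [{'original': 'He go home.', 'corrected': 'He goes home.', 'position': 0}]
# Note: the comparison is sentence-by-sentence, so it assumes the correction keeps
# the sentence count roughly unchanged.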
def calculate_readability_metrics(text):
"""Calculate various readability metrics"""
if not text.strip():
return {}
try:
return {
"flesch_reading_ease": textstat.flesch_reading_ease(text),
"flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
"gunning_fog": textstat.gunning_fog(text),
"smog_index": textstat.smog_index(text),
"automated_readability_index": textstat.automated_readability_index(text),
"coleman_liau_index": textstat.coleman_liau_index(text),
"reading_time": f"{textstat.reading_time(text, ms_per_char=14):.1f} seconds"
}
    except Exception:
return {"error": "Readability calculation failed"}
def find_repeated_words(text):
"""Find repeated words in close proximity"""
words = text.lower().split()
repeated = []
    for i in range(max(0, len(words) - 4)):  # slide a 5-word window over the text
window = words[i:i+5]
for word in set(window):
if len(word) > 3 and window.count(word) > 1: # Only consider words longer than 3 chars
repeated.append(word)
return list(set(repeated))
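# Illustrative usage (only words longer than 3 characters count):
#   find_repeated_words("The report covers the report findings in detail")
#   # -> ['report']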
def identify_passive_voice(text):
"""Identify potential passive voice usage"""
# Simple pattern matching for common passive voice constructions
passive_patterns = [
r'\b(?:am|is|are|was|were|be|being|been)\s+(\w+ed)\b',
r'\b(?:am|is|are|was|were|be|being|been)\s+(\w+en)\b'
]
passive_instances = []
for pattern in passive_patterns:
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
start = max(0, match.start() - 20)
end = min(len(text), match.end() + 20)
context = text[start:end]
passive_instances.append({
"match": match.group(0),
"context": context,
"position": match.start()
})
return passive_instances
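# Illustrative usage:
#   identify_passive_voice("The report was written by the team.")
#   # -> one instance matching "was written", with up to ~20 characters of context on each side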
def analyze_sentiment(text):
"""Analyze sentiment of the text"""
if not sentiment_analyzer or len(text.strip()) < 5: # Skip very short text
return {"label": "N/A", "score": 0}
try:
result = sentiment_analyzer(text[:512])[0] # Limit text length for the model
return result
    except Exception:
return {"label": "Error", "score": 0}
def extract_entities(text):
"""Extract named entities from text"""
if not nlp:
return []
entities = []
try:
# Process text in chunks if it's too long
        max_chars = 100000  # process in chunks, staying well below spaCy's default max_length of 1,000,000 characters
if len(text) > max_chars:
chunks = [text[i:i+max_chars] for i in range(0, len(text), max_chars)]
else:
chunks = [text]
for chunk in chunks:
doc = nlp(chunk)
for ent in doc.ents:
entities.append({
"text": ent.text,
"label": ent.label_,
"start": ent.start_char,
"end": ent.end_char
})
    except Exception:
pass
return entities
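# Illustrative usage (labels come from the spaCy model, if it loaded):
#   extract_entities("Apple was founded by Steve Jobs in California.")
#   # -> might include entries labelled ORG, PERSON and GPE, with character offsets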
def suggest_simpler_vocabulary(text):
"""Suggest simpler alternatives for complex words"""
# This is a simplified implementation
complex_words = {
"utilize": "use",
"implement": "use",
"facilitate": "help",
"leverage": "use",
"optimize": "improve",
"commence": "start",
"terminate": "end",
"endeavor": "try",
"cognizant": "aware",
"prioritize": "focus on",
"ascertain": "find out",
"subsequent": "later",
"initiate": "start",
"finalize": "finish",
"abundant": "many",
"adequate": "enough",
"demonstrate": "show",
"encounter": "meet",
"generate": "create",
"observe": "see",
"obtain": "get",
"require": "need",
"sufficient": "enough",
"utilize": "use",
"endeavour": "try",
"comprehend": "understand",
"procure": "get",
"inquire": "ask",
"commence": "begin",
"purchase": "buy",
"assist": "help"
}
suggestions = {}
for word, replacement in complex_words.items():
if re.search(r'\b' + word + r'\b', text, re.IGNORECASE):
suggestions[word] = replacement
return suggestions
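# Illustrative usage:
#   suggest_simpler_vocabulary("We will utilize the data to facilitate planning.")
#   # -> {'utilize': 'use', 'facilitate': 'help'}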
def summarize_text(text, max_length=150, min_length=40):
"""Summarize the text using a pre-trained model"""
    if not summarizer:
        return "Summarization model not available"
    if len(text.split()) < 30:  # Don't summarize short text
        return "Text is too short for summarization"
try:
# Split into chunks if text is too long
max_chunk_length = 1024 # Most summarization models have limits
if len(text.split()) > max_chunk_length:
sentences = nltk.sent_tokenize(text)
chunks = []
current_chunk = []
current_length = 0
for sentence in sentences:
sentence_length = len(sentence.split())
if current_length + sentence_length <= max_chunk_length:
current_chunk.append(sentence)
current_length += sentence_length
else:
chunks.append(" ".join(current_chunk))
current_chunk = [sentence]
current_length = sentence_length
if current_chunk:
chunks.append(" ".join(current_chunk))
# Summarize each chunk and combine
summaries = []
for chunk in chunks:
summary = summarizer(chunk, max_length=min(max_length, len(chunk.split())),
min_length=min(min_length, len(chunk.split())//2),
do_sample=False)[0]['summary_text']
summaries.append(summary)
return " ".join(summaries)
else:
return summarizer(text, max_length=max_length, min_length=min_length,
do_sample=False)[0]['summary_text']
except Exception as e:
return f"Summarization failed: {str(e)}"
def analyze_formality(text):
"""Analyze the formality level of the text"""
# Simple heuristics-based formality analysis
formal_indicators = [
r'\b(?:however|therefore|thus|consequently|furthermore|moreover|nevertheless)\b',
r'\b(?:shall|ought|whom|whereby|herein|therein|wherein)\b',
        r'\b(?:Mr|Mrs|Ms|Dr|Prof)\.',
r'\b(?:would like to|I am writing to)\b'
]
informal_indicators = [
r'\b(?:yeah|nope|gonna|wanna|gotta|kinda|sorta)\b',
r'(?:!{2,}|\?{2,})',
r'\b(?:lol|omg|btw|imo|tbh)\b',
r"(?:don't|won't|can't|shouldn't|wouldn't|isn't|aren't|haven't)",
r'\b(?:awesome|cool|super|great|huge)\b'
]
formal_count = 0
for pattern in formal_indicators:
formal_count += len(re.findall(pattern, text, re.IGNORECASE))
informal_count = 0
for pattern in informal_indicators:
informal_count += len(re.findall(pattern, text, re.IGNORECASE))
# Calculate formality score (simple version)
word_count = len(text.split())
if word_count == 0:
return {"formality_level": "Unknown", "score": 0.5}
formal_ratio = formal_count / max(1, word_count)
informal_ratio = informal_count / max(1, word_count)
# Determine formality level
if formal_ratio > 0.05 and formal_ratio > informal_ratio * 2:
formality = "Formal"
score = min(0.9, 0.5 + formal_ratio * 5)
elif informal_ratio > 0.05 and informal_ratio > formal_ratio * 2:
formality = "Informal"
score = max(0.1, 0.5 - informal_ratio * 5)
else:
formality = "Neutral"
score = 0.5
return {"formality_level": formality, "score": score}
def detect_tone(text):
"""Detect the overall tone of the text"""
if not text.strip():
return "Neutral"
# Simple keyword-based tone detection
tone_keywords = {
"Professional": ["recommend", "inform", "request", "provide", "consider", "suggest", "propose", "analyze", "evaluate", "conclude"],
"Academic": ["research", "study", "analysis", "theory", "hypothesis", "methodology", "findings", "literature", "experiment", "data"],
"Friendly": ["thanks", "appreciate", "happy", "glad", "hope", "welcome", "please", "enjoy", "share", "connect"],
"Persuasive": ["should", "must", "need", "important", "crucial", "essential", "significant", "consider", "believe", "argue"],
"Urgent": ["immediately", "urgent", "asap", "quickly", "soon", "deadline", "critical", "emergency", "promptly", "hurry"],
"Cautious": ["perhaps", "might", "may", "possible", "potentially", "suggest", "consider", "could", "seems", "appears"]
}
tone_scores = {tone: 0 for tone in tone_keywords}
word_count = len(text.split())
# Count occurrences of tone keywords
for tone, keywords in tone_keywords.items():
for keyword in keywords:
tone_scores[tone] += len(re.findall(r'\b' + keyword + r'\b', text, re.IGNORECASE))
# Normalize by word count
for tone in tone_scores:
tone_scores[tone] = tone_scores[tone] / max(1, word_count)
# Find the most dominant tone
dominant_tone = max(tone_scores.items(), key=lambda x: x[1])
# Only return a specific tone if it's significantly present
if dominant_tone[1] > 0.02:
return dominant_tone[0]
else:
return "Neutral"
def text_analysis(text):
"""Comprehensive text analysis"""
if not text.strip():
return {
"grammar_issues": [],
"corrected_text": "",
"readability": {},
"repeated_words": [],
"passive_voice": [],
"sentiment": {"label": "N/A", "score": 0},
"entities": [],
"simpler_vocabulary": {},
"formality": {"formality_level": "Unknown", "score": 0.5},
"tone": "Neutral",
"summary": "",
"word_count": 0,
"sentence_count": 0,
"average_sentence_length": 0
}
# Basic text stats
word_count = len(text.split())
sentences = nltk.sent_tokenize(text)
sentence_count = len(sentences)
avg_sentence_length = word_count / max(sentence_count, 1)
# Correct grammar with AI model
corrected_text = correct_grammar_with_model(text)
# Find grammar issues by comparing original and corrected text
grammar_issues = find_grammar_issues(text, corrected_text)
# Run all analysis functions
readability = calculate_readability_metrics(text)
repeated_words = find_repeated_words(text)
passive_voice = identify_passive_voice(text)
sentiment = analyze_sentiment(text)
entities = extract_entities(text)
simpler_words = suggest_simpler_vocabulary(text)
formality = analyze_formality(text)
tone = detect_tone(text)
# Generate summary for longer text
summary = ""
if word_count > 50:
summary = summarize_text(text)
return {
"grammar_issues": grammar_issues,
"corrected_text": corrected_text,
"readability": readability,
"repeated_words": repeated_words,
"passive_voice": passive_voice,
"sentiment": sentiment,
"entities": entities,
"simpler_vocabulary": simpler_words,
"formality": formality,
"tone": tone,
"summary": summary,
"word_count": word_count,
"sentence_count": sentence_count,
"average_sentence_length": avg_sentence_length
}
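# Illustrative usage:
#   results = text_analysis("Paste any draft paragraph here ...")
#   results["corrected_text"], results["readability"], results["tone"]  # etc.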
def format_grammar_issues(issues):
if not issues:
return "No grammar issues found."
result = "Grammar Issues Found:\n\n"
for i, issue in enumerate(issues, 1):
result += f"{i}. Original: \"{issue['original']}\"\n"
result += f" Corrected: \"{issue['corrected']}\"\n\n"
return result
def format_readability(metrics):
if not metrics:
return "Readability metrics not available."
if "error" in metrics:
return f"Error: {metrics['error']}"
# Define interpretations for Flesch Reading Ease
def interpret_flesch(score):
if score >= 90: return "Very Easy (5th grade)"
elif score >= 80: return "Easy (6th grade)"
elif score >= 70: return "Fairly Easy (7th grade)"
elif score >= 60: return "Standard (8th-9th grade)"
elif score >= 50: return "Fairly Difficult (10th-12th grade)"
elif score >= 30: return "Difficult (College)"
else: return "Very Difficult (College Graduate)"
result = "Readability Analysis:\n\n"
result += f"• Flesch Reading Ease: {metrics['flesch_reading_ease']:.1f} - {interpret_flesch(metrics['flesch_reading_ease'])}\n"
result += f"• Flesch-Kincaid Grade Level: {metrics['flesch_kincaid_grade']:.1f}\n"
result += f"• Gunning Fog Index: {metrics['gunning_fog']:.1f}\n"
result += f"• SMOG Index: {metrics['smog_index']:.1f}\n"
result += f"• Automated Readability Index: {metrics['automated_readability_index']:.1f}\n"
result += f"• Coleman-Liau Index: {metrics['coleman_liau_index']:.1f}\n"
result += f"• Estimated Reading Time: {metrics['reading_time']}"
return result
def format_passive_voice(passive_instances):
if not passive_instances:
return "No passive voice detected."
result = f"Passive Voice Detected ({len(passive_instances)} instances):\n\n"
for i, instance in enumerate(passive_instances[:5], 1): # Show up to 5 examples
result += f"{i}. \"...{instance['context']}...\"\n"
if len(passive_instances) > 5:
result += f"\nand {len(passive_instances) - 5} more..."
return result
def format_entities(entities):
if not entities:
return "No named entities detected."
# Group entities by type
entity_groups = {}
for entity in entities:
if entity['label'] not in entity_groups:
entity_groups[entity['label']] = []
entity_groups[entity['label']].append(entity['text'])
result = "Named Entities Detected:\n\n"
for label, items in entity_groups.items():
unique_items = list(set(items))[:5] # Show up to 5 unique entities per type
result += f"• {label}: {', '.join(unique_items)}"
if len(set(items)) > 5:
result += f" and {len(set(items)) - 5} more"
result += "\n"
return result
def format_vocabulary_suggestions(suggestions):
if not suggestions:
return "No vocabulary simplification suggestions."
result = "Vocabulary Simplification Suggestions:\n\n"
for complex_word, simple_word in suggestions.items():
result += f"• \"{complex_word}\" → \"{simple_word}\"\n"
return result
def build_interface():
with gr.Blocks(title="AI Grammar & Style Assistant", theme=gr.themes.Soft()) as app:
gr.Markdown("# 📝 AI Grammar & Style Assistant")
gr.Markdown("Powered by AI to help improve your writing with advanced grammar checking, style suggestions, and more!")
with gr.Tab("Text Analysis"):
with gr.Row():
with gr.Column(scale=3):
input_text = gr.Textbox(
label="Enter your text here",
placeholder="Type or paste your text here for analysis...",
lines=10
)
analyze_btn = gr.Button("Analyze Text", variant="primary")
with gr.Column(scale=3):
corrected_output = gr.Textbox(label="Corrected Text", lines=10)
with gr.Row():
with gr.Column():
grammar_issues = gr.Textbox(label="Grammar Issues", lines=6)
readability_metrics = gr.Textbox(label="Readability Analysis", lines=10)
with gr.Column():
passive_voice_output = gr.Textbox(label="Passive Voice Detection", lines=6)
vocab_suggestions = gr.Textbox(label="Vocabulary Suggestions", lines=6)
with gr.Row():
with gr.Column():
entity_detection = gr.Textbox(label="Entity Detection", lines=6)
with gr.Column():
formality_tone = gr.Textbox(label="Formality & Tone Analysis", lines=6)
with gr.Row():
text_summary = gr.Textbox(label="Text Summary", lines=4)
with gr.Row():
text_stats = gr.JSON(label="Text Statistics")
with gr.Tab("Help & Information"):
gr.Markdown("""
## How to Use This Tool
1. Enter or paste your text in the input box
2. Click "Analyze Text"
3. Review the analysis results across all categories
## Features
- **Grammar Correction**: AI-powered grammar correction using advanced language models
- **Readability Analysis**: Multiple readability metrics including Flesch Reading Ease, Gunning Fog, and more
- **Style Improvement**: Detects passive voice, repeated words, and complex vocabulary
- **Named Entity Recognition**: Identifies people, organizations, locations, and more
- **Sentiment Analysis**: Detects the emotional tone of your text
- **Formality Analysis**: Determines if your text is formal, neutral, or informal
- **Text Summarization**: Creates a concise summary of longer texts
- **Tone Detection**: Identifies the overall tone (professional, academic, friendly, etc.)
## About
This is an AI-powered writing assistant similar to Grammarly, built with Python, Gradio, and Hugging Face transformer models.
""")
def process_text(text):
"""Process the input text and return all analysis results"""
if not text.strip():
return ("", "No text to analyze.", "No text to analyze.", "No text to analyze.",
"No text to analyze.", "No text to analyze.", "No text to analyze.", "No text to analyze.",
{})
# Perform comprehensive analysis
results = text_analysis(text)
# Format corrected text
corrected = results["corrected_text"] if results["corrected_text"] else text
# Format grammar issues
grammar_output = format_grammar_issues(results["grammar_issues"])
# Format readability metrics
readability_output = format_readability(results["readability"])
# Format passive voice detection
passive_output = format_passive_voice(results["passive_voice"])
# Format entity detection
entities_output = format_entities(results["entities"])
# Format vocabulary suggestions
vocab_output = format_vocabulary_suggestions(results["simpler_vocabulary"])
# Format formality and tone
formality_tone_output = f"Formality: {results['formality']['formality_level']} (Score: {results['formality']['score']:.2f})\nTone: {results['tone']}"
# Format summary
summary_output = results["summary"] if results["summary"] else "Summary not available for this text."
# Format text statistics
stats = {
"Word Count": results["word_count"],
"Sentence Count": results["sentence_count"],
"Average Sentence Length": f"{results['average_sentence_length']:.1f} words",
"Repeated Words": results["repeated_words"],
"Sentiment": f"{results['sentiment']['label']} (Score: {results['sentiment']['score']:.2f})"
}
return (corrected, grammar_output, readability_output, passive_output,
vocab_output, entities_output, formality_tone_output, summary_output, stats)
analyze_btn.click(
process_text,
inputs=[input_text],
outputs=[corrected_output, grammar_issues, readability_metrics,
passive_voice_output, vocab_suggestions, entity_detection,
formality_tone, text_summary, text_stats]
)
return app
# Create and launch the interface
app = build_interface()
# For Hugging Face Spaces deployment
if __name__ == "__main__":
app.launch()