| """ |
| Utility functions for Smart Auto-Complete |
| Provides common functionality for text processing, logging, and validation |
| """ |
|
|
| import html |
| import logging |
| import re |
| import sys |
| import unicodedata |
| from typing import Dict, List, Optional, Tuple |
|
|
|
|
def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Configure and return the application's "smart_autocomplete" logger.

    Handlers already attached to that logger are detached first, so calling
    this function repeatedly leaves exactly one console handler in place.

    Args:
        level: Name of a logging level (DEBUG, INFO, WARNING, ERROR,
            CRITICAL). It is upper-cased and looked up as an attribute of
            the ``logging`` module.

    Returns:
        The configured logger, emitting timestamped records to stdout.
    """
    app_logger = logging.getLogger("smart_autocomplete")

    # Resolve the level name once; logger and handler share the same threshold.
    resolved_level = getattr(logging, level.upper())
    app_logger.setLevel(resolved_level)

    # Iterate over a snapshot because removeHandler mutates the live list.
    for stale_handler in list(app_logger.handlers):
        app_logger.removeHandler(stale_handler)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(resolved_level)
    stdout_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    app_logger.addHandler(stdout_handler)

    return app_logger
|
|
|
|
def sanitize_input(text: str) -> str:
    """
    Sanitize and clean input text for processing.

    The text is coerced to ``str``, NFKC-normalized, HTML-escaped, has its
    whitespace collapsed, is stripped of control characters and finally
    trimmed at both ends.

    Args:
        text: Raw input text. Any falsy value (``None``, ``""``, ``0``)
            yields an empty string.

    Returns:
        Cleaned and sanitized text
    """
    if not text:
        return ""

    cleaned = str(text)

    # Normalize BEFORE escaping. NFKC folds compatibility characters such as
    # the fullwidth less-than sign (U+FF1C) into plain ASCII "<"; if escaping
    # ran first, those characters would turn into raw markup afterwards and
    # slip past the escape step.
    cleaned = unicodedata.normalize("NFKC", cleaned)
    cleaned = html.escape(cleaned)

    # Collapse three or more newlines (optionally separated by whitespace)
    # into a single blank line, then squeeze runs of spaces/tabs to one space.
    cleaned = re.sub(r"\n\s*\n\s*\n", "\n\n", cleaned)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)

    # Drop ASCII control characters (code points below 32) except newline/tab.
    cleaned = "".join(ch for ch in cleaned if ord(ch) >= 32 or ch in "\n\t")

    return cleaned.strip()
|
|
|
|
def _contains_phrase(text: str, phrases: List[str]) -> bool:
    """Return True if any of *phrases* occurs in *text* as a whole word/phrase."""
    pattern = r"\b(?:" + "|".join(re.escape(phrase) for phrase in phrases) + r")\b"
    return re.search(pattern, text) is not None


def _count_phrases(text: str, phrases: List[str]) -> int:
    """Count how many distinct *phrases* occur in *text* as whole words."""
    return sum(
        1
        for phrase in phrases
        if re.search(r"\b" + re.escape(phrase) + r"\b", text)
    )


def extract_context_hints(text: str) -> Dict[str, object]:
    """
    Extract contextual hints from the input text to improve suggestions

    Keyword checks match whole words only, so short keywords such as "hi"
    or "who" do not fire inside unrelated words like "this" or "whole".

    Args:
        text: Input text to analyze. ``None`` is treated as empty text.

    Returns:
        Dictionary containing context hints:
            length (int): number of characters in the text
            word_count (int): number of whitespace-separated words
            has_greeting (bool): a greeting word/phrase is present
            has_signature (bool): a sign-off phrase is present
            has_code_markers (bool): a code-like token is present
            has_questions (bool): a "?" or a question word is present
            tone (str): "formal", "casual" or "neutral"
            language_style (str): "technical", "business", "creative",
                or the default "linkedin"
    """
    text = text or ""

    hints = {
        "length": len(text),
        "word_count": len(text.split()),
        "has_greeting": False,
        "has_signature": False,
        "has_code_markers": False,
        "has_questions": False,
        "tone": "neutral",
        "language_style": "linkedin",
    }

    text_lower = text.lower()

    # Email-style openers and sign-offs.
    email_greetings = [
        "dear",
        "hello",
        "hi",
        "greetings",
        "good morning",
        "good afternoon",
    ]
    email_signatures = [
        "sincerely",
        "best regards",
        "thank you",
        "yours truly",
        "kind regards",
    ]

    hints["has_greeting"] = _contains_phrase(text_lower, email_greetings)
    hints["has_signature"] = _contains_phrase(text_lower, email_signatures)

    # Code markers stay as plain substring checks: several are punctuation
    # ("//", "#") or carry a significant trailing space ("def "), which a
    # word-boundary pattern cannot express.
    code_markers = [
        "//",
        "/*",
        "*/",
        "#",
        "def ",
        "function",
        "class ",
        "import ",
        "from ",
    ]
    hints["has_code_markers"] = any(marker in text_lower for marker in code_markers)

    question_words = ["what", "how", "why", "when", "where", "who"]
    hints["has_questions"] = "?" in text or _contains_phrase(
        text_lower, question_words
    )

    # Tone is decided by which vocabulary has more distinct hits; a tie
    # (including zero hits on both sides) leaves the tone "neutral".
    formal_words = ["please", "kindly", "respectfully", "sincerely", "professional"]
    casual_words = ["hey", "yeah", "cool", "awesome", "thanks"]

    formal_count = _count_phrases(text_lower, formal_words)
    casual_count = _count_phrases(text_lower, casual_words)

    if formal_count > casual_count:
        hints["tone"] = "formal"
    elif casual_count > formal_count:
        hints["tone"] = "casual"

    # Style precedence: technical > business > creative > default.
    if hints["has_code_markers"]:
        hints["language_style"] = "technical"
    elif hints["has_greeting"] or hints["has_signature"]:
        hints["language_style"] = "business"
    elif _contains_phrase(text_lower, ["once upon", "story", "character", "plot"]):
        hints["language_style"] = "creative"

    return hints
|
|
|
|
def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Validate API key format for different providers

    This is a format check only (prefix and minimum length); it does not
    contact the provider.

    Args:
        api_key: The API key to validate
        provider: The provider name (openai, anthropic); matched
            case-insensitively, ignoring surrounding whitespace

    Returns:
        True if the key format is valid, False otherwise. Missing or
        non-string keys/providers and unknown providers yield False.
    """
    if not api_key or not isinstance(api_key, str):
        return False
    # A missing or non-string provider cannot match anything; report it as
    # invalid instead of raising AttributeError on ``.lower()``.
    if not isinstance(provider, str):
        return False

    key = api_key.strip()
    provider_name = provider.strip().lower()
    min_length = 40

    if len(key) < min_length:
        return False

    if provider_name == "openai":
        # "sk-ant-" is the Anthropic prefix; without this exclusion every
        # Anthropic-format key would also pass as an OpenAI key.
        return key.startswith("sk-") and not key.startswith("sk-ant-")
    if provider_name == "anthropic":
        return key.startswith("sk-ant-")

    return False
|
|
|
|
def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Truncate text to a maximum length while optionally preserving word boundaries

    The returned string, including the trailing "..." marker, is never
    longer than ``max_length``.

    Args:
        text: Text to truncate
        max_length: Maximum allowed length of the result. Values <= 0 give
            an empty string; values too small to fit the "..." marker give
            a plain hard cut without the marker.
        preserve_words: Whether to preserve word boundaries. The cut moves
            back to the last space only if that space lies in the final
            20% of the available room, so a long word does not cost most
            of the text.

    Returns:
        Truncated text
    """
    marker = "..."

    if max_length <= 0:
        return ""
    if len(text) <= max_length:
        return text
    if max_length <= len(marker):
        # No room for any content plus the marker: hard cut.
        return text[:max_length]

    # Reserve space for the marker so the result honours max_length.
    budget = max_length - len(marker)
    cut = budget

    if preserve_words:
        # Search up to and including index `budget`: a space exactly there
        # means cutting at `budget` already falls on a word boundary.
        last_space = text.rfind(" ", 0, budget + 1)
        if last_space > budget * 0.8:
            cut = last_space

    return text[:cut].rstrip() + marker
|
|
|
|
def format_suggestions_for_display(
    suggestions: List[str], max_display_length: int = 100
) -> List[Dict[str, object]]:
    """
    Format suggestions for display in the UI

    Args:
        suggestions: List of suggestion strings
        max_display_length: Length limit passed to ``truncate_text`` when
            building the display text

    Returns:
        List of formatted suggestion dictionaries, one per input
        suggestion and in the same order, with keys:
            id (int): 1-based position of the suggestion
            text (str): the suggestion after ``sanitize_input``
            display_text (str): ``text`` shortened with ``truncate_text``
            length (int): character count of ``text``
            word_count (int): whitespace-separated word count of ``text``
    """
    formatted = []

    # Ids are 1-based because they are meant for display.
    for position, raw_suggestion in enumerate(suggestions, 1):
        cleaned = sanitize_input(raw_suggestion)
        formatted.append(
            {
                "id": position,
                "text": cleaned,
                "display_text": truncate_text(cleaned, max_display_length),
                "length": len(cleaned),
                "word_count": len(cleaned.split()),
            }
        )

    return formatted
|
|
|
|
def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Score how similar two texts are by the overlap of their word sets.

    Both texts are lower-cased and split on whitespace; the score is the
    number of words they share divided by the number of distinct words
    across both.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1; 0.0 when either text is empty
        or neither contains any words.
    """
    if not (text1 and text2):
        return 0.0

    tokens_a = set(text1.lower().split())
    tokens_b = set(text2.lower().split())

    combined = tokens_a | tokens_b
    if not combined:
        # Both inputs were whitespace only.
        return 0.0

    shared = tokens_a & tokens_b
    return len(shared) / len(combined)
|
|
|
|
def get_text_stats(text: str) -> Dict[str, int]:
    """
    Get basic statistics about the text

    Args:
        text: Text to analyze

    Returns:
        Dictionary with text statistics:
            characters: number of non-whitespace characters
            words: number of whitespace-separated words
            sentences: number of runs of ".", "!" or "?" (at least 1 for
                text that has content)
            paragraphs: number of non-blank chunks separated by blank
                lines (at least 1 for text that has content)
        Empty or whitespace-only text yields zero for every statistic.
    """
    # Whitespace-only text has no content, so report it like empty text
    # rather than as one sentence and one paragraph.
    if not text or not text.strip():
        return {"characters": 0, "words": 0, "sentences": 0, "paragraphs": 0}

    words = text.split()

    # Summing word lengths excludes every whitespace character, including
    # "\r" from Windows line endings.
    char_count = sum(len(word) for word in words)

    # A run of terminators such as "?!" or "..." counts as one sentence end.
    sentence_count = len(re.findall(r"[.!?]+", text))

    # A blank line may carry stray whitespace or "\r"; all of these count
    # as paragraph breaks.
    paragraph_count = len(
        [chunk for chunk in re.split(r"\n\s*\n", text) if chunk.strip()]
    )

    return {
        "characters": char_count,
        "words": len(words),
        # Text without terminal punctuation still counts as one sentence.
        "sentences": max(1, sentence_count),
        "paragraphs": max(1, paragraph_count),
    }
|
|