Spaces:
Build error
Build error
| import streamlit as st | |
| import pickle | |
| import re | |
| from scipy.sparse import hstack | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.preprocessing import LabelEncoder | |
| from nltk.tokenize import word_tokenize | |
| from nltk.corpus import stopwords | |
| from nltk.stem import PorterStemmer, WordNetLemmatizer | |
| import nltk | |
| import numpy as np | |
# Fetch the NLTK data packages at start-up, in the same order as before.
# The tokenizer, stopword lists and lemmatizer used further down rely on
# these being present locally.
for _resource in (
    'averaged_perceptron_tagger',
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
):
    nltk.download(_resource)
# Load pre-trained models and encoders
# NOTE(review): pickle.load runs code embedded in the file it reads; these
# artifacts are assumed to be trusted files shipped with the app.
def _unpickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


nb_model = _unpickle("MultiOutput_NB.pkl")
rf_model = _unpickle("MultiOutput_RF.pkl")
vectorizer = _unpickle("vectorizer.pkl")
label_encoder_category = _unpickle("category_encoder.pkl")
label_encoder_subcategory = _unpickle("subcategory_encoder.pkl")
| # Load custom stopwords | |
def load_custom_stopwords(file_path):
    """Read a one-entry-per-line stopword file into a set.

    Surrounding whitespace is stripped from every entry and blank lines are
    skipped.  The file is decoded as ``utf-8-sig`` so that a leading
    byte-order mark (typical of files saved by Windows editors) is discarded
    instead of being attached to the first stopword, which would stop that
    word from ever matching a token.  Files without a BOM decode exactly as
    plain UTF-8.

    Args:
        file_path: Path of the stopword file.

    Returns:
        The set of stopwords.  When the file does not exist, an error is
        shown through ``st.error`` and an empty set is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return {line.strip() for line in file if line.strip()}
    except FileNotFoundError:
        st.error(f"Stopwords file '{file_path}' not found.")
        return set()
| # Function to preprocess input text | |
def preprocess_text(text):
    """Normalise a raw description into the token string given to the vectorizer.

    Steps, in order: remove every character that is not an ASCII letter or
    whitespace, lowercase the result, tokenize it with NLTK, drop stopwords
    (NLTK 'english' + NLTK 'hinglish' + the entries of ``stop_hinglish.txt``),
    then Porter-stem each remaining token and lemmatize the stem.

    Args:
        text: Raw description string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Stopword vocabulary: custom file first, then the two NLTK lists.
    extra_stopwords = load_custom_stopwords('stop_hinglish.txt')
    stopword_vocab = (
        set(stopwords.words('english'))
        | set(stopwords.words('hinglish'))
        | extra_stopwords
    )

    # Strip non-letters before lowercasing, exactly as the original order did.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    normalised = []
    for token in word_tokenize(letters_only):
        if token in stopword_vocab:
            continue
        # Stem first, then lemmatize the stem.
        normalised.append(lemmatizer.lemmatize(stemmer.stem(token)))

    return ' '.join(normalised)
# Streamlit app: page header and instructions
st.title("Crime Category & Subcategory Classification")
st.write("Enter the crime description below to classify its category and sub-category.")

# Let the user pick which of the two pre-loaded classifiers to run
model_choice = st.selectbox(
    "Choose the classification model:",
    ["Naive Bayes", "Random Forest"],
)

# Free-text description to classify
user_input = st.text_area("Crime Description:")

if st.button("Classify"):
    if not user_input.strip():
        # Only whitespace (or nothing) was entered
        st.error("Please enter a valid crime description.")
    else:
        # Clean the description and echo the cleaned form back to the user
        cleaned_text = preprocess_text(user_input)
        st.write("Cleaned text:", cleaned_text)

        # Vectorize the single cleaned document
        vectorized_text = vectorizer.transform([cleaned_text])

        # Two zero-valued placeholder columns appended after the text features
        # NOTE(review): presumably the models were trained with two extra
        # numeric columns - confirm what they should contain.
        additional_features = [[0, 0]]
        combined_features = hstack([vectorized_text, additional_features])

        # Map the dropdown label to the corresponding loaded model
        available_models = {
            "Naive Bayes": nb_model,
            "Random Forest": rf_model,
        }
        if model_choice not in available_models:
            st.error("Invalid model choice.")
            st.stop()
        model = available_models[model_choice]

        predictions = model.predict(combined_features)

        try:
            # Pick the encoded class indices according to the output's rank
            rank = predictions.ndim
            if rank == 3:
                # One array per output: take the arg-max of each
                category_index = np.argmax(predictions[0])
                subcategory_index = np.argmax(predictions[1])
            elif rank == 2:
                # One row per sample holding both encoded labels
                first_row = predictions[0]
                category_index = int(first_row[0])
                subcategory_index = int(first_row[1])
            else:
                st.error("Unexpected model output shape. Please check your model.")
                st.stop()

            # Turn the encoded indices back into their label strings
            predicted_category = label_encoder_category.inverse_transform([category_index])[0]
            predicted_subcategory = label_encoder_subcategory.inverse_transform([subcategory_index])[0]

            st.success("Classification Results:")
            st.write(f"**Category:** {predicted_category}")
            st.write(f"**Sub-Category:** {predicted_subcategory}")
        except Exception as e:
            st.error(f"Error decoding predictions: {e}")