NLP_Classical / app.py
PR-HARIHARAN's picture
Update app.py
acf666a verified
import streamlit as st
import pickle
import re
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import numpy as np
# Fetch the NLTK data packages at startup. The tokenizer, stopword and
# WordNet packages back the NLTK helpers imported above; the tagger package
# is not referenced by any code visible in this file.
for _nltk_package in (
    'averaged_perceptron_tagger',
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
):
    nltk.download(_nltk_package)
# Load pre-trained models and encoders
@st.cache_resource(show_spinner=False)
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    Streamlit re-executes this script on every widget interaction; wrapping
    the loader in ``st.cache_resource`` lets each artifact be read and
    deserialized once and then reused, instead of on every rerun.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so these artifacts must only ever come from a trusted source.
    with open(path, "rb") as file:
        return pickle.load(file)


# Classifiers selectable in the UI.
nb_model = _load_pickle("MultiOutput_NB.pkl")
rf_model = _load_pickle("MultiOutput_RF.pkl")
# Text vectorizer applied to the preprocessed description.
vectorizer = _load_pickle("vectorizer.pkl")
# Encoders used to map predicted indices back to label names.
label_encoder_category = _load_pickle("category_encoder.pkl")
label_encoder_subcategory = _load_pickle("subcategory_encoder.pkl")
# Load custom stopwords
def load_custom_stopwords(file_path):
    """Read a stopword list from a UTF-8 text file, one entry per line.

    Args:
        file_path: Path of the stopword file.

    Returns:
        A set holding every non-blank line with surrounding whitespace
        stripped. If the file does not exist, an error is shown through
        ``st.error`` and an empty set is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw_line.strip() for raw_line in handle)
            # Blank and whitespace-only lines strip to '' and are dropped.
            return {entry for entry in stripped_lines if entry}
    except FileNotFoundError:
        st.error(f"Stopwords file '{file_path}' not found.")
        return set()
# Function to preprocess input text
def preprocess_text(text):
    """Normalize a raw description into a space-separated string of tokens.

    Steps, in order: remove every character that is not an ASCII letter or
    whitespace, lowercase the result, tokenize with ``word_tokenize``, drop
    stopwords (NLTK's 'english' and 'hinglish' lists plus the entries of
    ``stop_hinglish.txt``), then Porter-stem and WordNet-lemmatize each
    remaining token.

    Args:
        text: Raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Assemble the stopword set; the custom file is read first so a missing
    # file is reported before the NLTK corpora are touched.
    file_entries = load_custom_stopwords('stop_hinglish.txt')
    excluded = set(stopwords.words('english'))
    excluded.update(stopwords.words('hinglish'))
    excluded.update(file_entries)

    # Strip non-letters BEFORE lowercasing, then tokenize.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(letters_only.lower())

    # Filter stopwords, then stem and lemmatize (stemmer output feeds the
    # lemmatizer) in a single pass.
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()
    return ' '.join(
        wordnet.lemmatize(porter.stem(token))
        for token in tokens
        if token not in excluded
    )
# Streamlit app: page header, model picker, free-text input, and the
# classify-on-click handler.
st.title("Crime Category & Subcategory Classification")
st.write("Enter the crime description below to classify its category and sub-category.")

# Model selection dropdown
model_choice = st.selectbox(
    "Choose the classification model:",
    ["Naive Bayes", "Random Forest"],
)

# Input area
user_input = st.text_area("Crime Description:")

if st.button("Classify"):
    if not user_input.strip():
        # Blank or whitespace-only input: nothing to classify.
        st.error("Please enter a valid crime description.")
    else:
        # Preprocess the input and echo the cleaned form back to the user.
        cleaned_text = preprocess_text(user_input)
        st.write("Cleaned text:", cleaned_text)

        # Vectorize the text, then append two zero-valued placeholder
        # columns. NOTE(review): presumably the models were fitted with two
        # extra numeric feature columns — confirm against the training code.
        text_matrix = vectorizer.transform([cleaned_text])
        placeholder_columns = [[0, 0]]
        combined_features = hstack([text_matrix, placeholder_columns])

        # Map the dropdown label to the corresponding loaded model.
        models_by_label = {
            "Naive Bayes": nb_model,
            "Random Forest": rf_model,
        }
        model = models_by_label.get(model_choice)
        if model is None:
            st.error("Invalid model choice.")
            st.stop()

        # Predict using the selected model
        predictions = model.predict(combined_features)

        try:
            output_rank = predictions.ndim
            if output_rank == 2:
                # Single 2-D array: row 0 holds [category, subcategory].
                first_row = predictions[0]
                category_index = int(first_row[0])
                subcategory_index = int(first_row[1])
            elif output_rank == 3:
                # One array per output: take the arg-max of each.
                category_index = np.argmax(predictions[0])
                subcategory_index = np.argmax(predictions[1])
            else:
                st.error("Unexpected model output shape. Please check your model.")
                st.stop()

            # Decode both indices before displaying anything, so a failed
            # decode shows only the error message.
            predicted_category = label_encoder_category.inverse_transform(
                [category_index]
            )[0]
            predicted_subcategory = label_encoder_subcategory.inverse_transform(
                [subcategory_index]
            )[0]

            # Display results
            st.success("Classification Results:")
            st.write(f"**Category:** {predicted_category}")
            st.write(f"**Sub-Category:** {predicted_subcategory}")
        except Exception as e:
            st.error(f"Error decoding predictions: {e}")