Spaces:

PR-HARIHARAN
/

NLP_Classical

Build error

App Files Files Community

NLP_Classical / app.py

PR-HARIHARAN

Update app.py

acf666a verified over 1 year ago

raw

history blame contribute delete

4.85 kB

	import streamlit as st
	import pickle
	import re
	from scipy.sparse import hstack
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.preprocessing import LabelEncoder
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer, WordNetLemmatizer
	import nltk
	import numpy as np
	# Fetch the NLTK data packages this script requests at startup, in the
	# same order as before.
	for _nltk_resource in (
	    'averaged_perceptron_tagger',
	    'punkt',
	    'punkt_tab',
	    'stopwords',
	    'wordnet',
	):
	    nltk.download(_nltk_resource)


	# Load pre-trained models and encoders
	@st.cache_resource(show_spinner=False)
	def _load_artifacts():
	    """Unpickle the two classifiers, the vectorizer and both label encoders.

	    Streamlit re-executes this script on every widget interaction, so the
	    loading is wrapped in ``st.cache_resource``: the files are read and
	    deserialized once and the same objects are reused on later reruns.
	    Requires a Streamlit release that provides ``st.cache_resource``.

	    Returns:
	        A 5-tuple ``(nb_model, rf_model, vectorizer,
	        label_encoder_category, label_encoder_subcategory)`` in that order.

	    Raises:
	        FileNotFoundError: if any of the pickle files is missing.
	    """
	    artifact_paths = (
	        "MultiOutput_NB.pkl",
	        "MultiOutput_RF.pkl",
	        "vectorizer.pkl",
	        "category_encoder.pkl",
	        "subcategory_encoder.pkl",
	    )
	    loaded = []
	    for path in artifact_paths:
	        # SECURITY NOTE: pickle.load can execute arbitrary code from the
	        # file it reads; these files must only ever come from a trusted
	        # source (i.e. shipped with the app), never from user uploads.
	        with open(path, "rb") as file:
	            loaded.append(pickle.load(file))
	    return tuple(loaded)


	# Bind the cached objects to the module-level names the rest of the script
	# uses.
	(
	    nb_model,
	    rf_model,
	    vectorizer,
	    label_encoder_category,
	    label_encoder_subcategory,
	) = _load_artifacts()

	# Read a custom stopword list from disk
	def load_custom_stopwords(file_path):
	    """Return the set of non-blank, whitespace-stripped lines in *file_path*.

	    The file is read as UTF-8, one entry per line. If the file does not
	    exist, an error is shown through Streamlit and an empty set is returned.
	    """
	    try:
	        with open(file_path, 'r', encoding='utf-8') as handle:
	            stripped_lines = (raw_line.strip() for raw_line in handle)
	            # Drop lines that are empty once surrounding whitespace is removed.
	            return {entry for entry in stripped_lines if entry}
	    except FileNotFoundError:
	        st.error(f"Stopwords file '{file_path}' not found.")
	        return set()

	# Normalize a raw description into the token string fed to the vectorizer
	def preprocess_text(text):
	    """Clean *text* and return its remaining tokens joined by single spaces.

	    Steps, in order: delete every character that is not an ASCII letter or
	    whitespace, lowercase, tokenize with NLTK, drop stopwords (the custom
	    'stop_hinglish.txt' list plus NLTK's 'english' and 'hinglish' lists),
	    then Porter-stem and WordNet-lemmatize each surviving token.
	    """
	    # Build the stopword set; the custom file is consulted first, as before.
	    excluded = load_custom_stopwords('stop_hinglish.txt')
	    english_words = set(stopwords.words('english'))
	    hinglish_words = set(stopwords.words('hinglish'))
	    excluded = english_words | hinglish_words | excluded

	    # Strip non-letters, then lowercase, then tokenize.
	    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
	    tokens = word_tokenize(letters_only.lower())

	    porter = PorterStemmer()
	    wordnet = WordNetLemmatizer()

	    normalized = []
	    for token in tokens:
	        if token in excluded:
	            continue
	        # Stem first, then lemmatize the stem.
	        normalized.append(wordnet.lemmatize(porter.stem(token)))

	    return ' '.join(normalized)

	# Page header and input widgets
	st.title("Crime Category & Subcategory Classification")
	st.write("Enter the crime description below to classify its category and sub-category.")

	# Dropdown selecting which of the two loaded classifiers to run
	model_choice = st.selectbox(
	    label="Choose the classification model:",
	    options=["Naive Bayes", "Random Forest"],
	)

	# Free-text box for the description to classify
	user_input = st.text_area(label="Crime Description:")

	# Run the classification when the button is pressed
	if st.button("Classify"):
	    if not user_input.strip():
	        # Nothing but whitespace was entered.
	        st.error("Please enter a valid crime description.")
	    else:
	        # Clean the description and echo the cleaned form back to the user.
	        cleaned_text = preprocess_text(user_input)
	        st.write("Cleaned text:", cleaned_text)

	        # Turn the cleaned text into a feature row with the loaded vectorizer.
	        text_features = vectorizer.transform([cleaned_text])

	        # Two zero-valued placeholder columns appended after the text
	        # features. NOTE(review): presumably the models were trained with
	        # two extra numeric features -- confirm and supply real values.
	        additional_features = [[0, 0]]
	        combined_features = hstack([text_features, additional_features])

	        # Map the dropdown label to the corresponding loaded model.
	        available_models = {
	            "Naive Bayes": nb_model,
	            "Random Forest": rf_model,
	        }
	        if model_choice not in available_models:
	            st.error("Invalid model choice.")
	            st.stop()
	        model = available_models[model_choice]

	        predictions = model.predict(combined_features)

	        try:
	            rank = predictions.ndim
	            if rank == 3:
	                # Treated as one array per output: take the arg-max of each.
	                category_index = np.argmax(predictions[0])
	                subcategory_index = np.argmax(predictions[1])
	            elif rank == 2:
	                # Treated as one row per sample holding
	                # [category, subcategory] encoded labels.
	                first_row = predictions[0]
	                category_index = int(first_row[0])
	                subcategory_index = int(first_row[1])
	            else:
	                st.error("Unexpected model output shape. Please check your model.")
	                st.stop()

	            # Map the encoded indices back to their original labels.
	            predicted_category = label_encoder_category.inverse_transform(
	                [category_index]
	            )[0]
	            predicted_subcategory = label_encoder_subcategory.inverse_transform(
	                [subcategory_index]
	            )[0]

	            st.success("Classification Results:")
	            st.write(f"Category: {predicted_category}")
	            st.write(f"Sub-Category: {predicted_subcategory}")
	        except Exception as e:
	            # Surface any failure while interpreting or decoding the output.
	            st.error(f"Error decoding predictions: {e}")