Instructions to use rmtariq/malay_classification with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use rmtariq/malay_classification with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="rmtariq/malay_classification")

# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("rmtariq/malay_classification")
model = AutoModelForSequenceClassification.from_pretrained("rmtariq/malay_classification")
- Notebooks
- Google Colab
- Kaggle
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Claim Classifier
---------------
Classifies claims based on priority index data, sentiment analysis, and content patterns.
Also provides functions for classifying claims into categories using a fine-tuned model.
"""
# NOTE: the docstring above must stay the first statement of the module
# (only comments may precede it); otherwise it is not stored as ``__doc__``.

# Path to the locally fine-tuned model
LOCAL_MODEL_PATH = "./models/finetuned_classification"

# Hugging Face model name (fallback)
MODEL_NAME = "rmtariq/malay_classification"

# Categories from the new dataset.
# The position of each name is used as its class id when the list is indexed
# with a predicted class id, so the order must not be changed.
CATEGORIES = [
    "Politik",
    "Perpaduan",
    "Keluarga",
    "Belia",
    "Perumahan",
    "Internet",
    "Pengguna",
    "Makanan",
    "Pekerjaan",
    "Pengangkutan",
    "Sukan",
    "Ekonomi",
    "Hiburan",
    "Jenayah",
    "Alam Sekitar",
    "Teknologi",
    "Pendidikan",
    "Agama",
    "Sosial",
    "Kesihatan",
    "Halal",
]
| import json | |
| import os | |
| import re | |
| import torch | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
def classify_specific_claims(claim):
    """
    Apply hand-written keyword rules to a claim.

    The claim is lower-cased and tested against each rule's regular
    expression with ``re.search``, in the order the rules are listed. The
    first rule that matches decides the result. Because ``re.search`` is
    used without word boundaries, a keyword also matches when it occurs
    inside a longer word.

    Args:
        claim (str): The claim text to classify

    Returns:
        tuple: (category, confidence) of the first matching rule, or
            (None, None) if no rule matches
    """
    # (pattern, category, confidence) - order matters: the first hit wins.
    rules = (
        (r"ketua polis|kpn|tan sri razarudin|saman|ugutan", "Jenayah", 0.95),
        (r"zakat fitrah|zakat|beras|dimakan", "Agama", 0.95),
        (r"kerajaan.+cukai|cukai.+minyak sawit|minyak sawit mentah", "Ekonomi", 0.95),
        (r"kanta lekap|dijual.+dalam talian|online", "Pengguna", 0.95),
        (r"kelongsong|peluru|dijajah|musuh", "Politik", 0.95),
    )

    text = claim.lower()
    hits = (
        (category, confidence)
        for pattern, category, confidence in rules
        if re.search(pattern, text)
    )
    # The generator is lazy, so scanning stops at the first matching rule.
    return next(hits, (None, None))
def load_model():
    """
    Load the classification model together with its tokenizer.

    The source is LOCAL_MODEL_PATH when that path exists on disk, otherwise
    the Hugging Face model MODEL_NAME. If loading raises any exception, the
    error is printed and "bert-base-multilingual-cased" is loaded instead,
    with a classification head sized to len(CATEGORIES).

    Returns:
        tuple: (model, tokenizer)
    """
    try:
        # Decide where to load from, then load tokenizer and model once.
        if os.path.exists(LOCAL_MODEL_PATH):
            print(f"Loading model from local path: {LOCAL_MODEL_PATH}")
            source = LOCAL_MODEL_PATH
        else:
            print(f"Local model not found. Loading from Hugging Face: {MODEL_NAME}")
            source = MODEL_NAME
        tokenizer = AutoTokenizer.from_pretrained(source)
        model = AutoModelForSequenceClassification.from_pretrained(source)
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        # Last resort: the generic multilingual BERT checkpoint.
        print("Falling back to bert-base-multilingual-cased")
        fallback_name = "bert-base-multilingual-cased"
        tokenizer = AutoTokenizer.from_pretrained(fallback_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            fallback_name,
            num_labels=len(CATEGORIES),
        )
        return model, tokenizer
def classify_claim(claim, model=None, tokenizer=None):
    """
    Classify a claim into one of the categories.

    The keyword rules in classify_specific_claims are tried first; only when
    they return no category is the model run. If either ``model`` or
    ``tokenizer`` is missing, both are obtained from load_model() on every
    call, so callers classifying many claims should load once and pass them in.

    Args:
        claim (str): The claim text to classify
        model: Optional pre-loaded model
        tokenizer: Optional pre-loaded tokenizer

    Returns:
        tuple: (category, confidence). ("Lain-lain", 0.0) is returned when the
            predicted class id cannot be mapped to any category name.
    """
    # First check if it's a specific claim
    category, confidence = classify_specific_claims(claim)
    if category is not None:
        return category, confidence

    # If not a specific claim, use the model
    if model is None or tokenizer is None:
        model, tokenizer = load_model()

    # Prepare the input (truncated to at most 128 tokens)
    inputs = tokenizer(claim, return_tensors="pt", truncation=True, max_length=128)

    # Get the prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # One claim is passed in, so row 0 holds its scores.
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)[0]
    predicted_class_id = int(logits[0].argmax().item())
    confidence = probabilities[predicted_class_id].item()

    # Prefer the label names stored in the model config.
    id2label = getattr(model.config, "id2label", None) or {}
    label = id2label.get(predicted_class_id, id2label.get(str(predicted_class_id)))

    # A config created without explicit label names carries auto-generated
    # placeholders of the form "LABEL_<n>" (this is what the base-model
    # fallback in load_model produces). Those are not category names, so they
    # are treated like a missing label and CATEGORIES is used instead.
    if label is not None and not re.fullmatch(r"LABEL_\d+", str(label)):
        return label, confidence

    if 0 <= predicted_class_id < len(CATEGORIES):
        return CATEGORIES[predicted_class_id], confidence

    # The predicted class id is outside every known mapping.
    return "Lain-lain", 0.0
def classify(priority_data):
    """
    Derive a verdict for a claim from its priority data using fixed rules.

    Flags are read from ``priority_data["priority_flags"]`` when that key is
    present; otherwise the dictionary itself is treated as the flag mapping.
    The score is ``priority_data["priority_score"]`` when given, else the sum
    of the numeric flag values. Negative sentiment is "dominant" when the
    negative count exceeds both the positive and the neutral count; counts
    are looked up under "positive"/"negative"/"neutral" or, failing that,
    under the keys "1"/"2"/"0".

    Rules, checked in this order:
        1. claim contains an azan-related word, a religion-related word and
           "larangan"                                     -> "FALSE"
        2. score >= 7.0, negative dominant, and the
           affects_government or law_related flag is set  -> "FALSE"
        3. score >= 5.0 and cause_confusion is set        -> "PARTIALLY_TRUE"
        4. score <= 3.0 and negative not dominant         -> "TRUE"
        5. economic_impact and affects_government set     -> "FALSE" if negative
           dominant, "PARTIALLY_TRUE" if cause_confusion is set, else "TRUE"
        6. anything else                                  -> "UNVERIFIED"

    Args:
        priority_data (dict): Dictionary containing priority flags and other data

    Returns:
        str: Classification verdict (TRUE, FALSE, PARTIALLY_TRUE, UNVERIFIED)

    Raises:
        ValueError: If ``priority_data`` is not a dictionary
    """
    if not isinstance(priority_data, dict):
        raise ValueError("Input must be a dictionary containing priority flags.")

    # Use the nested flag mapping when present, else the dictionary itself.
    priority_flags = priority_data.get("priority_flags", priority_data)

    # Normalise sentiment keys to strings so 1/2/0 and "1"/"2"/"0" both work.
    raw_counts = priority_data.get("sentiment_counts") or {}
    sentiment_counts = {str(k): v for k, v in raw_counts.items()}

    # Only compute the fallback score when no score was supplied, and only
    # from numeric values: in the flat layout the mapping also holds
    # non-numeric entries (claim text, sentiment counts) that cannot be summed.
    priority_score = priority_data.get("priority_score")
    if priority_score is None:
        priority_score = sum(
            v for v in priority_flags.values() if isinstance(v, (int, float))
        )

    claim = str(priority_data.get("claim") or "").lower()

    # Keyword checks on the claim text (plain substring tests)
    is_azan_claim = any(
        word in claim for word in ["azan", "larang", "masjid", "pembesar suara"]
    )
    is_religious_claim = any(
        word in claim
        for word in ["islam", "agama", "masjid", "surau", "sembahyang", "solat", "zakat"]
    )

    # A flag counts as set only when its value equals 1
    economic_related = priority_flags.get("economic_impact", 0) == 1
    government_related = priority_flags.get("affects_government", 0) == 1
    law_related = priority_flags.get("law_related", 0) == 1
    causes_confusion = priority_flags.get("cause_confusion", 0) == 1

    # Check for negative sentiment dominance
    negative_dominant = False
    if sentiment_counts:
        pos = int(sentiment_counts.get("positive", sentiment_counts.get("1", 0)))
        neg = int(sentiment_counts.get("negative", sentiment_counts.get("2", 0)))
        neu = int(sentiment_counts.get("neutral", sentiment_counts.get("0", 0)))
        negative_dominant = neg > pos and neg > neu

    # Special case for azan claim
    if is_azan_claim and is_religious_claim and "larangan" in claim:
        return "FALSE"  # Claim about banning azan is false

    # Determine verdict based on multiple factors
    if priority_score >= 7.0 and negative_dominant and (government_related or law_related):
        return "FALSE"
    if priority_score >= 5.0 and causes_confusion:
        return "PARTIALLY_TRUE"
    if priority_score <= 3.0 and not negative_dominant:
        return "TRUE"
    if economic_related and government_related:
        # Special case for economic policies by government
        if negative_dominant:
            return "FALSE"
        if causes_confusion:
            return "PARTIALLY_TRUE"
        return "TRUE"
    return "UNVERIFIED"
def get_verdict(priority_data):
    """
    Get verdict from priority data, which can be a file path or dictionary.

    A string is treated as the path of a JSON file, which is read as UTF-8
    and parsed. Every failure along the way (missing file, unreadable file,
    invalid JSON, content that is not a JSON object) is reported with a
    printed warning and yields "UNVERIFIED" instead of raising.

    Args:
        priority_data (str or dict): File path to JSON or dictionary with priority data

    Returns:
        str: Classification verdict; "UNVERIFIED" when the input cannot be
            loaded or is not a dictionary
    """
    # NOTE(review): the glyphs at the start of the messages below look like a
    # mis-decoded emoji (probably U+26A0 WARNING SIGN) - confirm against the
    # original file before changing them.
    if isinstance(priority_data, str):
        path = priority_data
        if not os.path.exists(path):
            print(f"β οΈ Warning: File not found: {path}")
            return "UNVERIFIED"
        try:
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(path, "r", encoding="utf-8") as f:
                priority_data = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: the path cannot be opened/read (e.g. it is a directory).
            # ValueError: invalid JSON or undecodable bytes.
            print(f"β οΈ Error reading file: {e}")
            return "UNVERIFIED"

    if not isinstance(priority_data, dict):
        print("β οΈ Warning: Input is not a dictionary")
        return "UNVERIFIED"

    return classify(priority_data)
def get_verdict_explanation(verdict):
    """
    Translate a verdict into display text and a colour.

    Args:
        verdict (str): Classification verdict

    Returns:
        tuple: (explanation text, color) where color is a hex colour string.
            Any verdict other than "TRUE", "FALSE" or "PARTIALLY_TRUE" gets
            the "insufficient data" explanation.
    """
    known_verdicts = (
        (
            "TRUE",
            "Claim appears to be factually accurate based on available data and sentiment analysis.",
            "#009933",  # Green
        ),
        (
            "FALSE",
            "Claim appears to be false based on available data and sentiment analysis.",
            "#FF0000",  # Red
        ),
        (
            "PARTIALLY_TRUE",
            "Claim contains a mix of accurate and inaccurate information based on available data.",
            "#FFCC00",  # Amber
        ),
    )
    for name, explanation, color in known_verdicts:
        if verdict == name:
            return (explanation, color)

    # UNVERIFIED, and anything unrecognised
    return ("Insufficient data to verify this claim. More information is needed.", "#0099CC")  # Blue
# Example CLI usage:
#   --claim TEXT      classify TEXT into a category
#   --category        (without --claim) run the classifier on built-in sample claims
#   --json PATH       compute a verdict from a priority JSON file
#   --claim-id ID     compute a verdict for a claim stored in the database (--db)
if __name__ == "__main__":
    import argparse
    import sys

    # NOTE(review): the bracketed glyphs in the messages below look like
    # mis-decoded emoji; they are kept exactly as found - confirm against the
    # original file's encoding before changing them.

    def _print_verdict(verdict):
        """Print a verdict together with its human-readable explanation."""
        explanation, _color = get_verdict_explanation(verdict)
        print(f"[π] Final Verdict: {verdict}")
        print(f"[π] Explanation: {explanation}")

    parser = argparse.ArgumentParser(description="Classify a claim based on priority data or category")
    parser.add_argument("--json", help="Path to priority JSON file")
    parser.add_argument("--claim-id", type=int, help="Claim ID to analyze")
    parser.add_argument("--db", default="data/claims.db", help="Path to database file")
    parser.add_argument("--claim", help="Claim text to classify into a category")
    parser.add_argument("--category", action="store_true", help="Classify claim into a category")
    args = parser.parse_args()

    if args.claim:
        # Use the new classification model
        print(f"[π₯] Classifying claim: {args.claim}")
        category, confidence = classify_claim(args.claim)
        print(f"[π] Category: {category}")
        print(f"[π] Confidence: {confidence:.4f}")
    elif args.category:
        # Test the classification model with sample claims.
        # This demo used to sit after an unconditional exit for the
        # "--category without --claim" case and could never run; it is now
        # what that flag combination does.
        print("\n[π§ͺ] Testing classification model with sample claims:")
        test_claims = [
            "Projek mega kerajaan penuh dengan ketirisan.",
            "Harga barang keperluan naik setiap bulan.",
            "Program vaksinasi tidak mencakupi golongan luar bandar.",
            "Makanan di hotel lima bintang tidak jelas status halalnya.",
        ]
        # Load once and reuse for every sample claim.
        model, tokenizer = load_model()
        for claim in test_claims:
            category, confidence = classify_claim(claim, model, tokenizer)
            print(f"Claim: {claim}")
            print(f"Category: {category}")
            print(f"Confidence: {confidence:.4f}")
            print("-" * 50)
    elif args.json:
        print(f"[π₯] Reading priority flags from: {args.json}")
        _print_verdict(get_verdict(args.json))
    elif args.claim_id is not None:
        # "is not None" so that claim ID 0 is accepted as a valid ID.
        verdict = "UNVERIFIED"
        try:
            # Import only if needed
            from priority_indexer import calculate_priority_from_db
        except ImportError:
            print("[β οΈ] Warning: priority_indexer module not found")
        else:
            print(f"[π₯] Calculating priority for claim ID: {args.claim_id}")
            try:
                priority_data = calculate_priority_from_db(args.claim_id, args.db)
                if priority_data:
                    verdict = classify(priority_data)
            except Exception as e:
                # Top-level CLI boundary: report the problem and fall back to
                # UNVERIFIED instead of showing a traceback.
                print(f"[β] Error: {e}")
                verdict = "UNVERIFIED"
        _print_verdict(verdict)
    else:
        print("[β] Error: Either --json, --claim-id, or --claim with --category must be provided")
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. when Python runs with -S).
        sys.exit(1)