| | """ |
| | Unit tests for features.py module. |
| | |
| | Tests individual functions for text cleaning, feature extraction, |
| | and label preparation. |
| | """ |
| | import pytest |
| | import numpy as np |
| | import pandas as pd |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| |
|
| | from hopcroft_skill_classification_tool_competition.features import ( |
| | clean_github_text, |
| | get_text_columns, |
| | get_label_columns, |
| | combine_text_fields, |
| | extract_tfidf_features, |
| | prepare_labels, |
| | get_dataset_info, |
| | load_data_from_db, |
| | ) |
| |
|
| |
|
@pytest.mark.unit
class TestTextCleaning:
    """Unit tests for text cleaning functionality.

    Exercises clean_github_text: URL/HTML/code stripping, whitespace
    normalization, stemming, emoji removal, and empty-input handling.
    """

    def test_clean_github_text_removes_urls(self):
        """Test that URLs are removed from text."""
        text = "Fixed bug https://github.com/repo/issues/123 in authentication"
        cleaned = clean_github_text(text)

        assert "https://" not in cleaned
        assert "github.com" not in cleaned
        # Stemmed forms of the surviving words should remain.
        assert "fix" in cleaned.lower()
        assert "authent" in cleaned.lower()

    def test_clean_github_text_removes_html(self):
        """Test that HTML tags are removed."""
        text = "Added <b>bold</b> feature with <i>italic</i> text"
        cleaned = clean_github_text(text)

        assert "<b>" not in cleaned
        assert "<i>" not in cleaned
        # Tag contents (stemmed) are kept, only the markup is stripped.
        assert "bold" in cleaned.lower()
        assert "ital" in cleaned.lower()

    def test_clean_github_text_removes_code_blocks(self):
        """Test that markdown code blocks are removed."""
        text = """Fixed bug in code:
```python
def foo():
    pass
```
"""
        cleaned = clean_github_text(text)

        assert "```" not in cleaned
        assert "python" not in cleaned
        assert "def" not in cleaned
        assert "fix" in cleaned.lower()

    def test_clean_github_text_removes_inline_code(self):
        """Test that inline code markers are removed."""
        text = "Updated `getUserById()` method implementation"
        cleaned = clean_github_text(text)

        assert "`" not in cleaned
        assert "method" in cleaned.lower()

    def test_clean_github_text_normalizes_whitespace(self):
        """Test that extra whitespace is normalized."""
        # Input deliberately contains runs of spaces and blank lines.
        text = "Fixed    multiple   spaces and\n\n\nnewlines"
        cleaned = clean_github_text(text)

        # No doubled spaces or embedded blank lines may survive.
        assert "  " not in cleaned
        assert "\n\n" not in cleaned

        # The cleaned text should already be in canonical single-space form.
        # (The previous check -- comparing len(words) against the count of
        # truthy words -- was a tautology: str.split() never yields "".)
        assert cleaned == " ".join(cleaned.split())

    @pytest.mark.parametrize("text,expected_empty", [
        ("", True),
        (None, True),
        ("   ", True),
        ("\n\n", True),
        ("a", False),
    ])
    def test_clean_github_text_empty_inputs(self, text, expected_empty):
        """Test handling of empty or null inputs."""
        cleaned = clean_github_text(text)
        # The cleaner must always return a string, never None.
        assert isinstance(cleaned, str)

        if expected_empty:
            assert cleaned == "" or cleaned.isspace()
        else:
            assert len(cleaned) > 0

    def test_clean_github_text_applies_stemming(self):
        """Test that stemming is applied to words."""
        text = "running walked swimming"
        cleaned = clean_github_text(text)

        # Porter-style stems of each inflected form should be present.
        assert "run" in cleaned.lower()
        assert "walk" in cleaned.lower()
        assert "swim" in cleaned.lower()

    def test_clean_github_text_removes_emojis(self):
        """Test that emojis and non-ASCII characters are removed."""
        text = "Fixed bug 😀 with special chars"
        cleaned = clean_github_text(text)

        assert cleaned.isascii()
        assert "fix" in cleaned.lower()
| |
|
| |
|
@pytest.mark.unit
class TestColumnIdentification:
    """Unit tests for column identification functions."""

    def test_get_text_columns_identifies_correctly(self, sample_dataframe):
        """Test that text columns are correctly identified."""
        found = get_text_columns(sample_dataframe)

        assert len(found) == 2
        for expected in ('issue text', 'issue description'):
            assert expected in found

    def test_get_text_columns_handles_missing_columns(self):
        """Test handling when text columns are missing."""
        frame = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
        found = get_text_columns(frame)

        assert isinstance(found, list)
        assert found == []

    def test_get_label_columns_identifies_correctly(self, sample_dataframe):
        """Test that label columns are correctly identified."""
        labels = get_label_columns(sample_dataframe)

        # Metadata and free-text columns must never be treated as labels.
        for excluded in ('Repo Name', 'PR #', 'issue text', 'issue description'):
            assert excluded not in labels

        # Known skill-label columns must all be detected.
        for expected in ('Language', 'Data Structure', 'Testing'):
            assert expected in labels

    def test_get_label_columns_only_numeric(self, sample_dataframe):
        """Test that only numeric columns are identified as labels."""
        for name in get_label_columns(sample_dataframe):
            assert pd.api.types.is_numeric_dtype(sample_dataframe[name])
| |
|
| |
|
@pytest.mark.unit
class TestTextCombination:
    """Unit tests for text combination functionality."""

    def test_combine_text_fields_combines_correctly(self, sample_dataframe):
        """Test that multiple text fields are combined."""
        merged = combine_text_fields(
            sample_dataframe, ['issue text', 'issue description']
        )

        assert isinstance(merged, pd.Series)
        assert len(merged) == len(sample_dataframe)

        # Every row must produce a non-empty string.
        for entry in merged:
            assert isinstance(entry, str)
            assert len(entry) > 0

    def test_combine_text_fields_applies_cleaning(self, sample_dataframe):
        """Test that cleaning is applied during combination."""
        sample_dataframe['issue text'] = [
            "Fixed https://example.com bug",
            "Added feature",
            "Updated docs",
            "Refactored code",
            "Improved tests",
        ]

        merged = combine_text_fields(sample_dataframe, ['issue text'])

        # URLs must be stripped by the cleaning step inside combination.
        for entry in merged:
            assert "https://" not in entry
            assert "example.com" not in entry

    def test_combine_text_fields_handles_nulls(self):
        """Test handling of null values in text fields."""
        frame = pd.DataFrame({
            'text1': ['hello', None, 'world'],
            'text2': [None, 'foo', 'bar'],
        })

        merged = combine_text_fields(frame, ['text1', 'text2'])

        assert len(merged) == 3
        # Nulls must be coerced to strings, never propagated as NaN/None.
        for entry in merged:
            assert isinstance(entry, str)
| |
|
| |
|
@pytest.mark.unit
class TestTfidfExtraction:
    """Unit tests for TF-IDF feature extraction."""

    def test_extract_tfidf_features_returns_correct_shape(self, sample_dataframe):
        """Test that TF-IDF extraction returns correct shape."""
        matrix, vec = extract_tfidf_features(sample_dataframe, max_features=50)

        assert matrix.shape[0] == len(sample_dataframe)
        assert matrix.shape[1] <= 50
        assert isinstance(vec, TfidfVectorizer)

    def test_extract_tfidf_features_returns_numpy_array(self, sample_dataframe):
        """Test that features are returned as numpy array."""
        matrix, _ = extract_tfidf_features(sample_dataframe)

        assert isinstance(matrix, np.ndarray)
        assert matrix.dtype in (np.float64, np.float32)

    @pytest.mark.parametrize("max_features", [10, 50, 100, None])
    def test_extract_tfidf_features_respects_max_features(
        self, sample_dataframe, max_features
    ):
        """Test that max_features parameter is respected."""
        matrix, _ = extract_tfidf_features(
            sample_dataframe, max_features=max_features
        )

        # None means "no cap", so only check bounded configurations.
        if max_features is not None:
            assert matrix.shape[1] <= max_features

    @pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (1, 3)])
    def test_extract_tfidf_features_ngram_range(
        self, sample_dataframe, ngram_range
    ):
        """Test different n-gram ranges."""
        matrix, vec = extract_tfidf_features(
            sample_dataframe,
            ngram_range=ngram_range,
            max_features=50,
        )

        assert matrix.shape[0] == len(sample_dataframe)
        vocab = vec.get_feature_names_out()

        if ngram_range[1] > 1:
            # Multi-word terms contain a space; with a small corpus the
            # vocabulary may be exhausted by unigrams, hence the fallback.
            multiword = [term for term in vocab if ' ' in term]
            assert len(multiword) > 0 or len(vocab) < 50

    def test_extract_tfidf_features_handles_empty_text(self):
        """Test handling of documents with empty text."""
        frame = pd.DataFrame({
            'issue text': ['', 'valid text', '   '],
            'issue description': ['desc', '', 'another desc'],
        })

        matrix, vec = extract_tfidf_features(frame, max_features=50)

        # Empty documents must not drop rows or poison the matrix.
        assert matrix.shape[0] == 3
        assert not np.any(np.isnan(matrix))
        assert not np.any(np.isinf(matrix))
| |
|
| |
|
@pytest.mark.unit
class TestLabelPreparation:
    """Unit tests for label preparation."""

    def test_prepare_labels_returns_binary(self, sample_dataframe):
        """Test that labels are converted to binary format."""
        labels = prepare_labels(sample_dataframe)

        # Only 0/1 may appear anywhere in the label matrix.
        assert set(np.unique(labels.values)).issubset({0, 1})

    def test_prepare_labels_correct_shape(self, sample_dataframe):
        """Test that label matrix has correct shape."""
        expected_cols = get_label_columns(sample_dataframe)
        labels = prepare_labels(sample_dataframe)

        assert labels.shape == (len(sample_dataframe), len(expected_cols))

    def test_prepare_labels_converts_counts_to_binary(self):
        """Test that label counts > 0 are converted to 1."""
        frame = pd.DataFrame({
            'Repo Name': ['repo1', 'repo2'],
            'issue text': ['text1', 'text2'],
            'Label1': [0, 5],
            'Label2': [3, 0],
            'Label3': [0, 0],
        })

        labels = prepare_labels(frame)

        # Any positive count becomes 1; zero stays 0.
        assert labels.loc[0, 'Label1'] == 0
        assert labels.loc[0, 'Label2'] == 1
        assert labels.loc[1, 'Label1'] == 1
        assert labels.loc[1, 'Label2'] == 0

    def test_prepare_labels_preserves_column_names(self, sample_dataframe):
        """Test that label column names are preserved."""
        expected_cols = get_label_columns(sample_dataframe)
        labels = prepare_labels(sample_dataframe)

        assert list(labels.columns) == expected_cols
| |
|
| |
|
@pytest.mark.unit
class TestDatasetInfo:
    """Unit tests for dataset information extraction."""

    def test_get_dataset_info_returns_dict(self, sample_dataframe):
        """Test that dataset info returns a dictionary."""
        assert isinstance(get_dataset_info(sample_dataframe), dict)

    def test_get_dataset_info_contains_required_keys(self, sample_dataframe):
        """Test that all required keys are present."""
        summary = get_dataset_info(sample_dataframe)

        for key in (
            'total_issues', 'total_columns', 'text_columns',
            'num_text_columns', 'label_columns', 'num_labels',
            'avg_labels_per_issue', 'median_labels_per_issue',
        ):
            assert key in summary

    def test_get_dataset_info_correct_counts(self, sample_dataframe):
        """Test that counts are calculated correctly."""
        summary = get_dataset_info(sample_dataframe)

        assert summary['total_issues'] == len(sample_dataframe)
        assert summary['total_columns'] == len(sample_dataframe.columns)
        assert summary['num_text_columns'] == 2

    def test_get_dataset_info_label_statistics(self, sample_dataframe):
        """Test label statistics are reasonable."""
        summary = get_dataset_info(sample_dataframe)

        # Per-issue label counts are non-negative and cannot exceed the
        # total number of label columns.
        assert summary['avg_labels_per_issue'] >= 0
        assert summary['median_labels_per_issue'] >= 0
        assert summary['avg_labels_per_issue'] <= summary['num_labels']
| |
|
| |
|
@pytest.mark.unit
@pytest.mark.requires_data
class TestDatabaseLoading:
    """Unit tests for database loading (requires temp DB)."""

    def test_load_data_from_db_returns_dataframe(self, temp_db):
        """Test that loading from DB returns a DataFrame."""
        loaded = load_data_from_db(temp_db)

        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) > 0

    def test_load_data_from_db_contains_expected_columns(self, temp_db):
        """Test that loaded data has expected columns."""
        loaded = load_data_from_db(temp_db)

        for column in ('issue text', 'issue description', 'Repo Name', 'PR #'):
            assert column in loaded.columns

    def test_load_data_from_db_nonexistent_file(self):
        """Test handling of nonexistent database file."""
        from pathlib import Path

        # Any exception type is acceptable here; the loader just must not
        # silently succeed on a missing file.
        with pytest.raises(Exception):
            load_data_from_db(Path("/nonexistent/path/to/db.db"))
| |
|
| |
|
@pytest.mark.unit
class TestEdgeCases:
    """Unit tests for edge cases and error handling."""

    def test_extract_tfidf_with_single_document(self):
        """Test TF-IDF extraction with only one document."""
        frame = pd.DataFrame({
            'issue text': ['Single document for testing'],
            'issue description': ['Description'],
            'Label1': [1],
        })

        # min_df=1 / max_df=1.0 keep the vocabulary from being pruned
        # away when there is only a single document.
        matrix, vec = extract_tfidf_features(
            frame, max_features=50, min_df=1, max_df=1.0
        )

        assert matrix.shape[0] == 1
        assert matrix.shape[1] > 0

    def test_extract_tfidf_with_identical_documents(self):
        """Test TF-IDF with identical documents."""
        frame = pd.DataFrame({
            'issue text': ['Same text'] * 3,
            'issue description': ['Same description'] * 3,
            'Label1': [1, 0, 1],
        })

        matrix, _ = extract_tfidf_features(
            frame, max_features=50, min_df=1, max_df=1.0
        )

        assert matrix.shape[0] == 3
        # Identical documents must still yield non-zero feature vectors.
        assert not np.all(matrix == 0)

    def test_prepare_labels_with_all_zeros(self):
        """Test label preparation when a label has all zeros."""
        frame = pd.DataFrame({
            'issue text': ['text1', 'text2'],
            'Label1': [0, 0],
            'Label2': [1, 1],
        })

        labels = prepare_labels(frame)

        assert labels['Label1'].sum() == 0
        assert labels['Label2'].sum() == 2

    def test_clean_text_with_only_special_characters(self):
        """Test cleaning text that contains only special characters."""
        cleaned = clean_github_text("!@#$%^&*()")

        # Must not raise; an empty string result is acceptable.
        assert isinstance(cleaned, str)
| |
|