|
13 | 13 | process_dirty_dataframes,
|
14 | 14 | process_nodes_dataframes,
|
15 | 15 | resolve_feature_engine,
|
16 |
| - FastEncoder |
| 16 | + FastEncoder, |
| 17 | + encode_textual |
17 | 18 | )
|
18 | 19 |
|
19 | 20 | from graphistry.features import topic_model, ngrams_model
|
@@ -446,5 +447,146 @@ def test_edge_scaling(self):
|
446 | 447 |
|
447 | 448 |
|
448 | 449 |
|
class TestModelNameHandling(unittest.TestCase):
    """Test that both legacy and new model name formats work correctly.

    Covers three naming schemes accepted by ``encode_textual``:
      * legacy bare names (``paraphrase-albert-small-v2``) which get a
        ``sentence-transformers/`` prefix added,
      * fully-qualified ``org/model`` names which are kept as-is,
      * local filesystem paths, from which only the trailing model
        directory name is used.
    """

    @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
    def test_model_name_formats(self):
        """Test various model name formats for backwards compatibility."""
        # Small frame with two text columns and one numeric column so we can
        # verify that only the text columns are selected for encoding.
        test_df = pd.DataFrame({
            'text1': ['hello world', 'test sentence', 'another example'],
            'text2': ['foo bar baz', 'quick brown fox', 'lazy dog jumps'],
            'number': [1, 2, 3]
        })

        # Test cases: (input_model_name, expected_to_work, description)
        test_cases = [
            # Legacy format (without org prefix) - should add sentence-transformers/
            ("paraphrase-albert-small-v2", True, "Legacy format without org prefix"),

            # Already has sentence-transformers prefix - should keep as-is
            ("sentence-transformers/paraphrase-albert-small-v2", True, "With sentence-transformers prefix"),

            # New format with different org - should keep as-is
            # Note: This would work with a real model like mixedbread-ai/mxbai-embed-large-v1
            # but for CI we use a known small model
            ("sentence-transformers/paraphrase-albert-small-v2", True, "Standard format with org prefix"),
        ]

        # Add a local-model case only when running in the Docker environment
        # that ships a pre-downloaded model at this path.
        import os
        if os.path.exists("/models/average_word_embeddings_komninos"):
            test_cases.append(
                ("/models/average_word_embeddings_komninos", True, "Local model path from Docker")
            )

        for model_name, should_work, description in test_cases:
            with self.subTest(model_name=model_name, description=description):
                try:
                    # Use a small model and min_words=0 for faster testing
                    result_df, text_cols, model = encode_textual(
                        test_df,
                        min_words=0,  # Process all text columns
                        model_name=model_name,
                        use_ngrams=False
                    )
                except Exception as e:
                    if should_work:
                        self.fail(f"Model {model_name} ({description}) should have worked but failed: {str(e)}")
                else:
                    # NOTE: assertions live in the `else` branch so that a real
                    # assertion failure is not swallowed by `except Exception`
                    # above and re-reported with a misleading message.
                    if not should_work:
                        self.fail(f"Model {model_name} ({description}) was expected to fail but succeeded")
                    self.assertIsInstance(result_df, pd.DataFrame)
                    self.assertGreater(result_df.shape[1], 0, f"No embedding columns created for {description}")
                    self.assertEqual(len(result_df), len(test_df), f"Row count mismatch for {description}")
                    self.assertIsNotNone(model, f"Model is None for {description}")
                    self.assertEqual(text_cols, ['text1', 'text2'], f"Wrong text columns detected for {description}")

    @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
    def test_new_model_provider_format(self):
        """Test that new model provider formats are handled correctly.

        Exercises the pure name-normalization logic (no model download):
        (input name -> normalized name the implementation should produce).
        """
        import os

        # Test the internal logic matching the actual implementation
        test_cases = [
            # Legacy: no slash means add sentence-transformers/ prefix
            ("model-name-only", "sentence-transformers/model-name-only"),
            ("paraphrase-MiniLM-L6-v2", "sentence-transformers/paraphrase-MiniLM-L6-v2"),

            # Already has sentence-transformers/ prefix - keep as-is
            ("sentence-transformers/model", "sentence-transformers/model"),
            ("sentence-transformers/paraphrase-MiniLM-L6-v2", "sentence-transformers/paraphrase-MiniLM-L6-v2"),

            # Alternative namespaces - keep as-is
            ("org/model-name", "org/model-name"),
            ("mixedbread-ai/mxbai-embed-large-v1", "mixedbread-ai/mxbai-embed-large-v1"),
            ("nomic-ai/nomic-embed-text-v1", "nomic-ai/nomic-embed-text-v1"),
            ("BAAI/bge-large-en-v1.5", "BAAI/bge-large-en-v1.5"),

            # Local paths - extract just the model name (old behavior)
            ("/local/path/to/model", "model"),
            ("/models/average_word_embeddings_komninos", "average_word_embeddings_komninos"),
            ("./relative/path/model", "model"),
        ]

        for input_name, expected_name in test_cases:
            with self.subTest(input_name=input_name):
                # Mirror the model-name processing logic of the implementation.
                if input_name.startswith('/') or input_name.startswith('./'):
                    # Local path - extract just the model name
                    processed_name = os.path.split(input_name)[-1]
                elif '/' not in input_name:
                    # Legacy format without org prefix
                    processed_name = f"sentence-transformers/{input_name}"
                else:
                    # Already has org/model format
                    processed_name = input_name

                self.assertEqual(processed_name, expected_name,
                    f"Model name processing failed for {input_name}")

    @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
    def test_alternative_namespace_models(self):
        """Test that alternative namespace models work with actual encoding.

        Encodes the same frame with the legacy bare name and the fully
        qualified name; both must resolve to the same model and hence
        produce identical embeddings.
        """
        # Single-row frame keeps the actual encoding step fast.
        test_df = pd.DataFrame({
            'text': ['test sentence for encoding'],
            'id': [1]
        })

        # Only test with small, known-to-exist models to avoid download issues in CI
        # Real-world usage would include models like:
        # - "mixedbread-ai/mxbai-embed-large-v1"
        # - "nomic-ai/nomic-embed-text-v1"
        # - "BAAI/bge-small-en-v1.5"

        # For CI, we'll use the small albert model with different formats
        small_model = "paraphrase-albert-small-v2"

        # Test that both formats produce the same embeddings
        result1, _, model1 = encode_textual(
            test_df,
            min_words=0,
            model_name=small_model,  # Legacy format
            use_ngrams=False
        )

        result2, _, model2 = encode_textual(
            test_df,
            min_words=0,
            model_name=f"sentence-transformers/{small_model}",  # Full format
            use_ngrams=False
        )

        # Both should produce the same embeddings
        self.assertEqual(result1.shape, result2.shape,
            "Different formats should produce same shape embeddings")
        self.assertTrue(np.allclose(result1.values, result2.values),
            "Different formats should produce identical embeddings")
| 590 | + |
# Allow running this test module directly (`python <file>.py`) in addition
# to discovery via pytest.
if __name__ == "__main__":
    unittest.main()
|
0 commit comments