# %pip install -qq gdown && gdown -qO . --folder https://drive.google.com/drive/folders/1qgLH_MERpz3nQyDBwpPPKxxcpYUVJoZy
# %pip install -qq emblaze numpy==2.0.0 hdbscan umap-learn

Note: you may need to restart the kernel to use updated packages.

import pandas as pd
import numpy as np

from sklearn.metrics import silhouette_score

import matplotlib.cm as cm
from matplotlib.patches import Patch

import warnings
# Suppress FutureWarning/UserWarning messages raised from sklearn and
# UserWarning messages raised from umap so they do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning, module='sklearn')
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
warnings.filterwarnings("ignore", category=UserWarning, module='umap')

from pathlib import Path
import os

# Processed CSV files are read from ./data/processed under the working directory.
DATA_FOLDER = Path(os.path.abspath('')).joinpath('data', 'processed')

# Card table; later cells read its 'name' column and its 'emb_*' columns.
card_embeddings = pd.read_csv(DATA_FOLDER.joinpath('card_embeddings.csv'))
card_embeddings.head()

# Category tag table; later cells read its 'name' column.
category_tags = pd.read_csv(DATA_FOLDER.joinpath('category_tags.csv'))
category_tags.head()

# Tag table; later cells read 'name', 'description' and the 'emb_*' columns.
tag_embeddings = pd.read_csv(DATA_FOLDER.joinpath('tag_embeddings.csv'))
tag_embeddings.head()

# Per-tag n-gram table; later cells read its 'tag' and 'bm25' columns.
tag_ngrams = pd.read_csv(DATA_FOLDER.joinpath('tag_ngrams.csv'))
tag_ngrams.head()

from sklearn.metrics.pairwise import cosine_similarity

def get_similar_tags(card_name, threshold=0.70):
  """Return the tags whose embedding is cosine-similar to a card's embedding.

  Reads the module-level ``card_embeddings``, ``tag_embeddings`` and
  ``category_tags`` data frames.

  Args:
    card_name: Value to match exactly against ``card_embeddings['name']``.
    threshold: Minimum cosine similarity (inclusive) for a tag to be kept.

  Returns:
    A DataFrame with columns ``name``, ``description``, ``category`` and
    ``similarity``, sorted by ``similarity`` in descending order. ``category``
    is True when the tag name appears in ``category_tags['name']``.

  Raises:
    ValueError: If no row of ``card_embeddings`` has the given name.
  """
  # Get embedding for the specified card
  card_columns = [c for c in card_embeddings.columns if c.startswith('emb_')]
  card_emb = card_embeddings.loc[
    card_embeddings['name'] == card_name,
    card_columns
  ].values
  if card_emb.shape[0] == 0:
    # Fail with a readable message instead of an sklearn shape error.
    raise ValueError(f"No card named {card_name!r} in card_embeddings")

  # Tags with missing embedding values cannot be compared, so drop them first
  embedding_columns = [c for c in tag_embeddings.columns if c.startswith('emb_')]
  filtered_tag_embeddings = tag_embeddings.dropna(subset=embedding_columns)
  tag_emb = filtered_tag_embeddings[embedding_columns].values

  # Only the first matching card row is used if the name matches several rows
  similarities = cosine_similarity(card_emb, tag_emb)[0]

  # Select tags above threshold (mask computed once and reused)
  above_threshold = similarities >= threshold
  similar_tags = filtered_tag_embeddings.loc[
    above_threshold,
    ['name', 'description']
  ].copy()
  similar_tags['similarity'] = similarities[above_threshold]
  similar_tags = similar_tags.sort_values(by='similarity', ascending=False)

  # Add a column that indicates whether the tag is a category tag
  similar_tags['category'] = similar_tags['name'].isin(category_tags['name'])

  return similar_tags[['name', 'description', 'category', 'similarity']]


# Example query: tags close to one card, then only the category tags among them.
card_name = 'Lightning Bolt'
print(f"Tags similar to '{card_name}' with similarity >= 0.70:")
similar_tags = get_similar_tags(card_name)

# Show full description text instead of truncating long cells.
pd.set_option('display.max_colwidth', None)
similar_tags.loc[similar_tags['category']]

Tags similar to 'Lightning Bolt' with similarity >= 0.70:

# Five highest-bm25 n-gram rows for each of three tags.
tag_ngrams.loc[tag_ngrams['tag'] == 'hate'].nlargest(5, 'bm25')

tag_ngrams.loc[tag_ngrams['tag'] == 'burn'].nlargest(5, 'bm25')

tag_ngrams.loc[tag_ngrams['tag'] == 'removal'].nlargest(5, 'bm25')

# Non-category tags from the previous query with similarity of at least 0.75.
similar_tags.loc[~similar_tags['category'] & (similar_tags['similarity'] >= 0.75)]

# Reverse lookup: rank every card by cosine similarity to one tag's embedding.
tag_emb_cols = [c for c in tag_embeddings.columns if c.startswith('emb_')]
card_emb_cols = [c for c in card_embeddings.columns if c.startswith('emb_')]

is_cycle_boon = tag_embeddings['name'] == 'cycle-boon'
cycle_boon_embedding = tag_embeddings.loc[is_cycle_boon, tag_emb_cols].values

similar_cards = card_embeddings.copy()
card_matrix = similar_cards[card_emb_cols].values
# Row 0 of the similarity matrix corresponds to the first matching tag row.
similar_cards['similarity'] = cosine_similarity(cycle_boon_embedding, card_matrix)[0]
similar_cards = similar_cards.sort_values(by='similarity', ascending=False)

similar_cards[['name', 'text', 'similarity']].head(20)

# Boolean mask over tag_embeddings rows marking tags to exclude from the
# projection and clustering below. Matching is done on the 'name' column.
tag_names = tag_embeddings['name']

filter_mask = (
  # Cycle tags
  tag_names.str.contains('cycle-') |
  (tag_names == 'cycle') |
  # Tags that reference obscure types (i.e. typal)
  tag_names.str.contains('typal-') |
  (tag_names == 'typal') |
  # Tags that reference type erratas
  tag_names.str.contains('type-errata-') |
  tag_names.str.contains('depreciated-') |
  # Tags that reference mechanics from Dungeons & Dragons
  # (These are specific to a D&D crossover set that are intentionally obtuse)
  tag_names.str.startswith('dnd-') |
  (tag_names == 'dnd') |
  # Tags prefixed 'un-' or 'unstable-' (presumably Un-set tags; see list below)
  tag_names.str.startswith('un-') |
  tag_names.str.startswith('unstable-') |
  # Tags whose name mentions '-name' or 'type-line'
  tag_names.str.contains('-name') |
  tag_names.str.contains('type-line') |
  # Individually listed tags
  tag_names.isin([
    'alliteration',
    'tongue-twister',
    'anagram',
    'namesake-spell',
    'substance',
    'eponymous',
    # Un-set mechanics and tags
    'fractional-life-damage',
    'time-matters',
    'watermark-matters',
    'art-matters',
    'flavor-text-matters',
    'border-color-matters',
    'collector-number-matters',
    'artist-matters',
    'card-style-matters',
  ])
)

# Embedding columns are addressed by position name: emb_0 .. emb_1023.
embedding_columns = [f'emb_{i}' for i in range(1024)]

# Keep the unfiltered tags that have a complete embedding.
df = tag_embeddings[~filter_mask].dropna(subset=embedding_columns)

embeddings = df[embedding_columns].values

embeddings.shape

(644, 1024)

from emblaze import Viewer, Embedding
from emblaze.utils import Field, ProjectionTechnique
from emblaze.thumbnails import TextThumbnails
from emblaze.datasets import EmbeddingSet


# Build (or load) the Emblaze comparison viewer for the tag embeddings.
# The projection is only computed when the saved JSON file is missing.
emblaze_file = Path('data/tag_embeddings_spectral.json')
if not emblaze_file.exists():
  # Create the Emblaze Embedding object
  emb = Embedding({
    Field.POSITION: embeddings,
    Field.COLOR: df['name'].values,
    # Category tags get a larger radius. Membership is tested on the tag
    # *name*, matching how category tags are looked up elsewhere in this
    # file; comparing the 'id' column against names never matches.
    Field.RADIUS: np.where(df['name'].isin(category_tags['name'].values), 5, 1),
  })
  emb.compute_neighbors(n_neighbors=5, metric='cosine')

  # Create a TextThumbnails object for the tooltips
  thumbnails = TextThumbnails(
    names=df['name'].values,
    descriptions=df['description'].values
  )

  # Compute ten 2D UMAP projections of the same embeddings for comparison
  variants = EmbeddingSet([
    emb.project(ProjectionTechnique.UMAP,
                metric='cosine', init='spectral') for _ in range(10)
  ])
  variants.compute_neighbors(metric='cosine')

  viewer = Viewer(embeddings=variants, thumbnails=thumbnails)
  viewer.save_comparison(emblaze_file.as_posix(), overwrite=True)
# Load from file if it exists
else:
  viewer = Viewer(file=emblaze_file.as_posix())

# Must be the last top-level expression of the cell for the notebook to render
# the widget; an expression nested inside the if/else is not displayed.
viewer

import umap
import matplotlib.pyplot as plt

# Render a grid of UMAP projections (one panel per random seed) and save it
# as an image, unless that image already exists on disk.
umap_grid_file = Path('umap_projections_grid.png')
if not umap_grid_file.exists():           # Can also try these seeds:
  seeds = [2, 13, 15, 40, 42, 46, 55, 59] #, 60, 69, 73, 76, 84, 88, 89, 94]

  n_cols = 4
  n_rows = -(-len(seeds) // n_cols)  # ceiling division

  # squeeze=False always yields a 2D array of axes, so one flatten suffices
  fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6), squeeze=False)
  axes = axes.flatten()

  for ax, seed in zip(axes, seeds):
    umap_model = umap.UMAP(metric='cosine', init='spectral', random_state=seed)
    umap_model.fit(embeddings)

    projection = umap_model.embedding_
    ax.scatter(projection[:, 0], projection[:, 1], s=5)
    ax.set_title(f'Random State: {seed}')
    ax.set_xlabel('UMAP Component 1')
    ax.set_ylabel('UMAP Component 2')

  # Hide panels that have no seed assigned
  for unused_ax in axes[len(seeds):]:
    unused_ax.set_visible(False)

  plt.tight_layout()
  fig.savefig(umap_grid_file.as_posix(), bbox_inches='tight', dpi=300)
  plt.show()

# Fit the UMAP model used by the rest of this file on the tag embeddings.
# random_state is fixed at 40 so the 2D layout is reproducible.
umap_model = umap.UMAP(metric='cosine', init='spectral', random_state=40)
umap_model.fit(embeddings)

# Plot the UMAP projection
plt.figure(figsize=(10, 6))
plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
)
plt.title('Tag Embeddings UMAP Projection (seed=40)')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()

# Draw a reproducible sample of cards and extract their embedding matrix.
sample_size = 1000
sample_cards = card_embeddings.sample(n=sample_size, random_state=0)
sample_embeddings = sample_cards[embedding_columns].to_numpy()

# Cards projected into the 2D space that was fitted on the tag embeddings.
sample_embedding_2d = umap_model.transform(sample_embeddings)

# The reverse direction: fit a second UMAP on the card sample, then project
# the tag embeddings into that card-fitted space.
umap_card_model = umap.UMAP(metric='cosine', init='spectral', random_state=40).fit(sample_embeddings)
tag_embeddings_in_card_space = umap_card_model.transform(embeddings)

from sklearn.cluster import MiniBatchKMeans

# Grid-search the number of MiniBatch k-means clusters (5..29) on the 2D tag
# projection, scoring each candidate with the silhouette coefficient.
cluster_range = range(5, 30)
silhouette_scores = []
for n_clusters in cluster_range:
  kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0)
  cluster_labels = kmeans.fit_predict(umap_model.embedding_)
  score = silhouette_score(umap_model.embedding_, cluster_labels)
  silhouette_scores.append(score)

# np.argmax returns the first maximum, so ties go to the smaller cluster count
optimal_n_clusters = cluster_range[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters: {optimal_n_clusters}")

# Plot the silhouette scores
plt.figure(figsize=(6, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('K-Means: Optimal Number of Clusters')
plt.axvline(x=optimal_n_clusters, color='red', linestyle='--',
            label=f'Optimal: {optimal_n_clusters}')
plt.legend()
plt.show()

# Perform k-means clustering with the optimal number of clusters
# (this leaves `kmeans` fitted on the tag projection for later cells)
kmeans = MiniBatchKMeans(n_clusters=optimal_n_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(umap_model.embedding_)

# Create a color map for the clusters
cluster_df = df.copy()
cluster_df['cluster'] = cluster_labels
unique_labels = np.unique(cluster_df['cluster'])
# The -1 exclusion mirrors the HDBSCAN cells (where -1 marks noise);
# NOTE(review): k-means labels are presumably never -1, so this is a no-op here.
n_clusters = len(unique_labels[unique_labels != -1])

# Create color map
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {}
for i, label in enumerate(unique_labels):
  color_dict[label] = colors[i]

# One color per tag row, in the same order as umap_model.embedding_
point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the UMAP projection with optimal clustering
plt.figure(figsize=(8, 6))
plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  c=point_colors,
  s=5,
)

# legend_elements is reused by the following plotting cells
legend_elements = [Patch(color=color_dict[label], label=f'Cluster {label}') for label in unique_labels]
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title(f'Tag Embeddings UMAP Projection with MiniBatch K-Means (n_clusters={optimal_n_clusters})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()

Optimal number of clusters: 19

# Side-by-side comparison of the two UMAP spaces, with tags colored by the
# current point_colors (one color per tag row).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Right plot (axes[1]): tag embeddings transformed by the card-fitted UMAP
# model, drawn over the card sample that model was fitted on
axes[1].scatter(
  umap_card_model.embedding_[:, 0],
  umap_card_model.embedding_[:, 1],
  c='black',
  s=5,
  alpha=0.1,
  label='Cards'
)
axes[1].scatter(
  tag_embeddings_in_card_space[:, 0],
  tag_embeddings_in_card_space[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Left plot (axes[0]): card sample transformed by the tag-fitted UMAP model,
# drawn under the tag projection itself
axes[0].scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  s=5,
  label='Cards',
  c='tab:gray',
  alpha=0.1
)
axes[0].scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the left plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

# Add the cluster legend to the right plot, placed outside the axes
axes[1].legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Assign each sampled card to a cluster of the k-means model that was fitted
# on the tag projection. predict() is used rather than fit_predict(): refitting
# on the cards would replace the tag-based centroids, so cluster ids would no
# longer correspond to color_dict / legend_elements, and any later
# kmeans.predict() call would silently use the card-fitted model.
cluster_labels = kmeans.predict(sample_embedding_2d)

cluster_df = sample_cards.copy()
# kmeans.labels_ still describes the tag rows, so use the predicted labels
cluster_df['cluster'] = cluster_labels
point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the sample embeddings with cluster labels
plt.figure(figsize=(8, 6))
plt.scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  c=point_colors,
  s=5,
)
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title(f'Card Embeddings with MiniBatch K-Means (n_clusters={optimal_n_clusters})')
plt.xlabel('Embedding Component 1')
plt.ylabel('Embedding Component 2')
plt.show()

from sklearn.cluster import SpectralClustering

# Grid search for optimal number of clusters (5..29) using spectral clustering
# on the 2D tag projection, scored with the silhouette coefficient.
cluster_range = range(5, 30)
silhouette_scores = []
for n_clusters in cluster_range:
  spectral_model = SpectralClustering(
    n_clusters=n_clusters,
    affinity='nearest_neighbors',
    random_state=40
  )
  cluster_labels = spectral_model.fit_predict(umap_model.embedding_)
  score = silhouette_score(umap_model.embedding_, cluster_labels)
  silhouette_scores.append(score)
  print(f"n_clusters={n_clusters}, silhouette_score={score:.3f}")

# After the loop, cluster_labels holds the labels of the LAST candidate
# (n_clusters=29), not of the optimal model fitted below.
optimal_n_clusters = cluster_range[np.argmax(silhouette_scores)]
best_score = max(silhouette_scores)
print(f"\nOptimal number of clusters: {optimal_n_clusters}")
print(f"Best silhouette score: {best_score:.3f}")

# Plot the silhouette scores
plt.figure(figsize=(6, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Spectral Clustering: Optimal Number of Clusters')
plt.axvline(x=optimal_n_clusters, color='red', linestyle='--',
            label=f'Optimal: {optimal_n_clusters}')
plt.legend()
plt.show()

# Fit final model with optimal parameters
spectral_model = SpectralClustering(
  n_clusters=optimal_n_clusters,
  affinity='nearest_neighbors',
  random_state=40
)
spectral_model.fit(umap_model.embedding_)

# Attach the final model's labels to a copy of the tag frame
cluster_df = df.copy()
cluster_df['cluster'] = spectral_model.labels_
unique_labels = np.unique(cluster_df['cluster'])
n_clusters = len(unique_labels[unique_labels != -1])

# Create color map (keys are exactly the labels in unique_labels)
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {}
for i, label in enumerate(unique_labels):
  color_dict[label] = colors[i]

# One color per tag row, in the same order as umap_model.embedding_
point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the UMAP projection with optimal clustering
plt.figure(figsize=(10, 6))
plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  c=point_colors,
  s=5,
)

# One legend entry per distinct cluster label
legend_elements = []
for label in sorted(unique_labels):
  legend_elements.append(Patch(facecolor=color_dict[label], label=f'Cluster {label}'))

plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title(f'Tag Embeddings UMAP Projection with Spectral Clustering (n_clusters={optimal_n_clusters})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()

n_clusters=5, silhouette_score=0.432
n_clusters=6, silhouette_score=0.381
n_clusters=7, silhouette_score=0.375
n_clusters=8, silhouette_score=0.386
n_clusters=9, silhouette_score=0.415
n_clusters=10, silhouette_score=0.459
n_clusters=11, silhouette_score=0.325
n_clusters=12, silhouette_score=0.436
n_clusters=13, silhouette_score=0.473
n_clusters=14, silhouette_score=0.435
n_clusters=15, silhouette_score=0.459
n_clusters=16, silhouette_score=0.480
n_clusters=17, silhouette_score=0.496
n_clusters=18, silhouette_score=0.506
n_clusters=19, silhouette_score=0.493
n_clusters=20, silhouette_score=0.474
n_clusters=21, silhouette_score=0.464
n_clusters=22, silhouette_score=0.477
n_clusters=23, silhouette_score=0.482
n_clusters=24, silhouette_score=0.511
n_clusters=25, silhouette_score=0.520
n_clusters=26, silhouette_score=0.516
n_clusters=27, silhouette_score=0.496
n_clusters=28, silhouette_score=0.500
n_clusters=29, silhouette_score=0.497

Optimal number of clusters: 25
Best silhouette score: 0.520

# Side-by-side comparison of the two UMAP spaces, with tags colored by the
# current point_colors (one color per tag row).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Right plot (axes[1]): tag embeddings transformed by the card-fitted UMAP
# model, drawn over the card sample that model was fitted on
axes[1].scatter(
  umap_card_model.embedding_[:, 0],
  umap_card_model.embedding_[:, 1],
  c='black',
  s=5,
  alpha=0.1,
  label='Cards'
)
axes[1].scatter(
  tag_embeddings_in_card_space[:, 0],
  tag_embeddings_in_card_space[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Left plot (axes[0]): card sample transformed by the tag-fitted UMAP model,
# drawn under the tag projection itself
axes[0].scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  s=5,
  label='Cards',
  c='tab:gray',
  alpha=0.1
)
axes[0].scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the left plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

# One legend entry per distinct cluster. Iterate unique_labels (the keys
# color_dict was built from) rather than cluster_labels, which holds one label
# per point from the last grid-search candidate: that would repeat entries and
# can contain labels that are not in color_dict.
legend_elements = [
  Patch(facecolor=color_dict[label], label=f'Cluster {label}')
  for label in sorted(unique_labels)
]

# Add the legend to the right plot, placed outside the axes
axes[1].legend(handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
               bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# NOTE(review): this cell repeats the preceding two-panel spectral plot
# verbatim; it is kept so the cell sequence is unchanged.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Right plot (axes[1]): tag embeddings transformed by the card-fitted UMAP
# model, drawn over the card sample that model was fitted on
axes[1].scatter(
  umap_card_model.embedding_[:, 0],
  umap_card_model.embedding_[:, 1],
  c='black',
  s=5,
  alpha=0.1,
  label='Cards'
)
axes[1].scatter(
  tag_embeddings_in_card_space[:, 0],
  tag_embeddings_in_card_space[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Left plot (axes[0]): card sample transformed by the tag-fitted UMAP model,
# drawn under the tag projection itself
axes[0].scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  s=5,
  label='Cards',
  c='tab:gray',
  alpha=0.1
)
axes[0].scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the left plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

# One legend entry per distinct cluster. Iterate unique_labels (the keys
# color_dict was built from) rather than cluster_labels, which holds one label
# per point from the last grid-search candidate: that would repeat entries and
# can contain labels that are not in color_dict.
legend_elements = [
  Patch(facecolor=color_dict[label], label=f'Cluster {label}')
  for label in sorted(unique_labels)
]

# Add the legend to the right plot, placed outside the axes
axes[1].legend(handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
               bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

import hdbscan

# Use HDBSCAN to cluster the embeddings
# (fitted on the 2D tag projection; prediction_data=True so the model can
# later be used with hdbscan.approximate_predict)
min_cluster_size=10
hdbscan_model = hdbscan.HDBSCAN(
  min_cluster_size=min_cluster_size,
  metric='euclidean',
  cluster_selection_method='eom',
  prediction_data=True
)
hdbscan_model.fit(umap_model.embedding_)

# Get unique cluster labels
cluster_df = df.copy()
cluster_df['cluster'] = hdbscan_model.labels_
unique_labels = np.unique(cluster_df['cluster'])
# Label -1 is treated as noise below and is not counted as a cluster
n_clusters = len(unique_labels[unique_labels != -1])

plt.figure(figsize=(8, 6))

# Create color map: noise is black, every real cluster gets a Spectral color
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {}
color_dict[-1] = 'black'
# From here on cluster_labels holds the distinct non-noise labels
cluster_labels = unique_labels[unique_labels != -1]
for i, label in enumerate(cluster_labels):
  color_dict[label] = colors[i]

# One color per tag row, in the same order as umap_model.embedding_
point_colors = [color_dict[label] for label in cluster_df['cluster']]

plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  c=point_colors,
  s=5
)

# Legend: noise first, then one entry per cluster in ascending label order
legend_elements = [Patch(facecolor='black', label='Noise (-1)')]
for label in sorted(cluster_labels):
  legend_elements.append(Patch(facecolor=color_dict[label], label=f'Cluster {label}'))

plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('UMAP Projection (seed=40) - HDBSCAN Clusters')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.tight_layout()
plt.show()

# Show tags from the main cluster (label 9)
# NOTE(review): the label 9 is hard-coded from one particular run.
cluster_df[cluster_df['cluster'] == 9][['id', 'name', 'description']].sort_values(by='name')

# Side-by-side comparison of the two UMAP spaces, with tags colored by the
# current point_colors (one color per tag row).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Right plot (axes[1]): tag embeddings transformed by the card-fitted UMAP
# model, drawn over the card sample that model was fitted on
axes[1].scatter(
  umap_card_model.embedding_[:, 0],
  umap_card_model.embedding_[:, 1],
  c='black',
  s=5,
  alpha=0.1,
  label='Cards'
)
axes[1].scatter(
  tag_embeddings_in_card_space[:, 0],
  tag_embeddings_in_card_space[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Left plot (axes[0]): card sample transformed by the tag-fitted UMAP model,
# drawn under the tag projection itself
axes[0].scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  s=5,
  label='Cards',
  c='tab:gray',
  alpha=0.1
)
axes[0].scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the left plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

# Add the cluster legend (plus a 'Cards' entry) to the right plot
axes[1].legend(handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
                bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

# Re-use the original hdbscan model to predict clusters on the sample embeddings.
# The model was fitted on the tag-fitted UMAP projection, so the cards must be
# given in that same space: sample_embedding_2d, which is also what is plotted.
# Coordinates from the card-fitted UMAP model belong to a different space.
# approximate_predict returns a tuple whose first element is the labels.
approximate_predict = hdbscan.approximate_predict(
  hdbscan_model, sample_embedding_2d
)

plt.figure(figsize=(8, 6))
plt.scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  # Re-use the same color mapping from before (-1 / noise maps to black)
  c=[color_dict[label] for label in approximate_predict[0]],
  s=5,
  alpha=0.7
)
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title('Card Embeddings with HDBSCAN Clusters (seed=40)')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.tight_layout()
plt.show()

# Define the parameter grid for min_cluster_size
min_cluster_sizes = list(range(5, 30))
silhouette_scores = []

# Fit one HDBSCAN model per candidate on the 2D tag projection and score it.
# These candidate models are fitted without prediction_data.
for min_cluster_size in min_cluster_sizes:
  hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom'
  )
  cluster_labels = hdbscan_model.fit_predict(umap_model.embedding_)

  # Only calculate silhouette score if we have more than one cluster (excluding noise)
  unique_labels = np.unique(cluster_labels)
  n_clusters = len(unique_labels[unique_labels != -1])

  if n_clusters > 1:
    # NOTE(review): cluster_labels is passed unfiltered, so points labelled -1
    # are scored as if they formed one more cluster.
    score = silhouette_score(umap_model.embedding_, cluster_labels)
    silhouette_scores.append(score)
    print(f"min_cluster_size={min_cluster_size}, n_clusters={n_clusters}, silhouette_score={score:.3f}")
  else:
    silhouette_scores.append(-1)  # Invalid score for single cluster
    print(f"min_cluster_size={min_cluster_size}, n_clusters={n_clusters}, silhouette_score=N/A")

# np.argmax returns the first maximum, so ties go to the smaller min_cluster_size
optimal_idx = np.argmax(silhouette_scores)
optimal_min_cluster_size = min_cluster_sizes[optimal_idx]
best_score = silhouette_scores[optimal_idx]
print(f"\nOptimal min_cluster_size: {optimal_min_cluster_size}")
print(f"Best silhouette score: {best_score:.3f}")

# Plot the silhouette scores for different min_cluster_sizes
plt.figure(figsize=(6, 6))
plt.plot(min_cluster_sizes, silhouette_scores, marker='o')
plt.xlabel('Min Cluster Size')
plt.ylabel('Silhouette Score')
plt.title('HDBSCAN: Optimal Min Cluster Size')
plt.axvline(x=optimal_min_cluster_size, color='red', linestyle='--',
      label=f'Optimal: {optimal_min_cluster_size}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Fit final HDBSCAN model with optimal parameters.
# prediction_data=True is required because this model is later passed to
# hdbscan.approximate_predict, which needs the cached prediction data.
hdbscan_model = hdbscan.HDBSCAN(
  min_cluster_size=optimal_min_cluster_size,
  metric='euclidean',
  cluster_selection_method='eom',
  prediction_data=True
)
hdbscan_model.fit(umap_model.embedding_)

# Get unique cluster labels
cluster_df = df.copy()
cluster_df['cluster'] = hdbscan_model.labels_
unique_labels = np.unique(cluster_df['cluster'])
# Label -1 is treated as noise below and is not counted as a cluster
n_clusters = len(unique_labels[unique_labels != -1])

# Create color map: noise is black, every real cluster gets a Spectral color
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {}
color_dict[-1] = 'black'
cluster_labels = unique_labels[unique_labels != -1]
for i, label in enumerate(cluster_labels):
  color_dict[label] = colors[i]

# Plot the tag embeddings with HDBSCAN clusters
plt.figure(figsize=(8, 6))
plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  c=[color_dict[label] for label in hdbscan_model.labels_],
  alpha=0.7,
  s=5,
)

# Legend: noise first, then one entry per cluster in ascending label order
legend_elements = [Patch(facecolor='black', label='Noise (-1)')]
for label in sorted(cluster_labels):
  legend_elements.append(Patch(facecolor=color_dict[label], label=f'Cluster {label}'))

plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title(f'Tag Embeddings with HDBSCAN Clusters (min_cluster_size={optimal_min_cluster_size})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.tight_layout()
plt.show()

min_cluster_size=5, n_clusters=32, silhouette_score=0.327
min_cluster_size=6, n_clusters=25, silhouette_score=0.361
min_cluster_size=7, n_clusters=20, silhouette_score=0.361
min_cluster_size=8, n_clusters=18, silhouette_score=0.320
min_cluster_size=9, n_clusters=17, silhouette_score=0.299
min_cluster_size=10, n_clusters=10, silhouette_score=0.417
min_cluster_size=11, n_clusters=10, silhouette_score=0.424
min_cluster_size=12, n_clusters=9, silhouette_score=0.428
min_cluster_size=13, n_clusters=9, silhouette_score=0.427
min_cluster_size=14, n_clusters=9, silhouette_score=0.421
min_cluster_size=15, n_clusters=9, silhouette_score=0.407
min_cluster_size=16, n_clusters=9, silhouette_score=0.404
min_cluster_size=17, n_clusters=9, silhouette_score=0.405
min_cluster_size=18, n_clusters=9, silhouette_score=0.404
min_cluster_size=19, n_clusters=8, silhouette_score=0.373
min_cluster_size=20, n_clusters=2, silhouette_score=0.327
min_cluster_size=21, n_clusters=7, silhouette_score=0.331
min_cluster_size=22, n_clusters=6, silhouette_score=0.295
min_cluster_size=23, n_clusters=6, silhouette_score=0.277
min_cluster_size=24, n_clusters=6, silhouette_score=0.274
min_cluster_size=25, n_clusters=6, silhouette_score=0.261
min_cluster_size=26, n_clusters=5, silhouette_score=0.235
min_cluster_size=27, n_clusters=5, silhouette_score=0.232
min_cluster_size=28, n_clusters=3, silhouette_score=0.356
min_cluster_size=29, n_clusters=3, silhouette_score=0.336

Optimal min_cluster_size: 12
Best silhouette score: 0.428

import json

# Raw cards: the key order of the JSON object defines each card's integer index.
with open('data/raw/cards.json', 'r', encoding='utf-8') as f:
  cards = json.load(f)
cards_keys = list(cards.keys())
oracle_id_to_idx = {oid: idx for idx, oid in enumerate(cards_keys)}

# Raw tags: positional index and tagging count per slug.
with open('data/raw/tags.json', 'r', encoding='utf-8') as f:
  tags = json.load(f)
tag_keys = {tag['slug']: i for i, tag in enumerate(tags)}
tag_counts = {tag['slug']: tag['taggingCount'] for tag in tags}

# Create a mapping of category tags to their descendant tags (tags.json).
# Descendants are referenced by id, so translate ids back to slugs; an id
# missing from id_to_slug yields None in the list.
# NOTE: this rebinds `category_tags`, which earlier held the DataFrame loaded
# from category_tags.csv.
id_to_slug = {tag['id']: tag['slug'] for tag in tags}
category_tags = {
  tag['slug']: [id_to_slug.get(t['tag']['id']) for t in tag['descendants']]
  for tag in tags
  if tag.get('category')
}

# Raw taggings, as loaded from taggings.json.
with open('data/raw/taggings.json', 'r', encoding='utf-8') as f:
  taggings = json.load(f)

# copied from analyze-tags.py script
def sample_tag_corpora(top_k: int = 100) -> dict[str, set[int]]:
    """Build a set of card indices for each tag slug.

    Reads the module-level ``category_tags``, ``taggings`` and
    ``oracle_id_to_idx`` mappings. Two passes are made:

    1. For every category slug, each descendant slug contributes a random
       sample (``np.random.choice`` without replacement) whose size is its
       share of the descendants' total tagging count, scaled to ``top_k``
       and truncated to an integer.
    2. For every slug in ``taggings``, the first ``top_k`` tagged items are
       taken. A slug handled in pass 1 that is also a key of ``taggings``
       is overwritten by this pass.

    Items whose oracle id is not in ``oracle_id_to_idx`` are skipped, and
    slugs that end up with no indices are omitted from the result.

    Args:
        top_k: Target number of items per tag.

    Returns:
        Mapping from tag slug to a set of card indices.
    """
    def _to_card_indices(items):
        # Translate tagging records into card indices, dropping unknown cards.
        return [
            oracle_id_to_idx[record['card']['oracleId']]
            for record in items
            if record['card']['oracleId'] in oracle_id_to_idx
        ]

    tag_idxs = {}

    # Category tags: proportional sampling from descendants
    for category_slug, descendant_slugs in category_tags.items():
        if not descendant_slugs:
            continue
        total_count = sum(len(taggings.get(slug, [])) for slug in descendant_slugs)
        if total_count == 0:
            continue

        collected = []
        # dict.fromkeys visits each distinct slug once, in first-seen order
        for slug in dict.fromkeys(descendant_slugs):
            quota = int(len(taggings.get(slug, [])) / total_count * top_k)
            if slug not in taggings or quota <= 0:
                continue
            candidates = taggings[slug]
            picked = np.random.choice(candidates,
                                      size=min(quota, len(candidates)),
                                      replace=False)
            if picked.size == 0:
                continue
            collected.extend(_to_card_indices(picked.tolist()))

        if collected:
            tag_idxs[category_slug] = set(collected)

    # Regular tags: sample up to top_k
    for tag_slug, tagged_items in taggings.items():
        collected = _to_card_indices(tagged_items[:top_k])
        if collected:
            tag_idxs[tag_slug] = set(collected)

    return tag_idxs

# Card-index sets per tag slug, targeting 100 items per tag
tag_corpora = sample_tag_corpora(100)

# sampled_category_tags = df[df['name'].isin(category_tags.keys())]

# Get a random subset of tags from the 'df' frame
# (despite the variable name, the sample is drawn from all rows of df, not
# only from category tags; random_state makes it reproducible)
sampled_category_tags = df.sample(n=200, random_state=42)
sampled_category_tags.shape

(200, 1027)

# Collect, for every sampled tag, the embedding rows of its sampled cards and
# record which tag each row came from in a 'tag' column.
sampled_tagged_cards = []
for tag in sampled_category_tags['name'].unique():
  if tag not in tag_corpora: continue
  card_indices = list(tag_corpora[tag])
  if not card_indices: continue
  # NOTE(review): iloc is positional, so this assumes the row order of
  # card_embeddings matches the key order of cards.json - confirm.
  sampled_tagged_cards.append(
    card_embeddings.iloc[card_indices].assign(tag=tag)
  )

sampled_tagged_cards = pd.concat(sampled_tagged_cards, ignore_index=True)

# Project both the sampled tags and their cards into the tag-fitted UMAP space
sampled_category_tags_emb = sampled_category_tags[embedding_columns].values
sampled_category_tags_2d = umap_model.transform(sampled_category_tags_emb)

sampled_tagged_cards_emb = sampled_tagged_cards[embedding_columns].values
sampled_tagged_cards_2d = umap_model.transform(sampled_tagged_cards_emb)

# K-means labels from whatever data `kmeans` was most recently fitted on
sampled_category_tags['kmeans_label'] = kmeans.predict(sampled_category_tags_2d)
sampled_tagged_cards['kmeans_label'] = kmeans.predict(sampled_tagged_cards_2d)

# Combine both datasets for consistent spectral clustering
# (fit_predict refits spectral_model on the combined points, so these labels
# are not the ones from the earlier tag-only fit)
combined_2d = np.vstack([sampled_category_tags_2d, sampled_tagged_cards_2d])
combined_labels = spectral_model.fit_predict(combined_2d)

# Split the labels back to the original datasets
n_tags = len(sampled_category_tags_2d)
sampled_category_tags['spectral_label'] = combined_labels[:n_tags]
sampled_tagged_cards['spectral_label'] = combined_labels[n_tags:]

# HDBSCAN labels via approximate_predict; element 0 of the result is the labels
tag_predict = hdbscan.approximate_predict(hdbscan_model, sampled_category_tags_2d)
card_predict = hdbscan.approximate_predict(hdbscan_model, sampled_tagged_cards_2d)

sampled_category_tags['hdbscan_label'] = tag_predict[0]
sampled_tagged_cards['hdbscan_label'] = card_predict[0]

# Preview both frames without the embedding columns
sampled_category_tags[sampled_category_tags.columns[~sampled_category_tags.columns.str.startswith('emb_')]].head()

sampled_tagged_cards[sampled_tagged_cards.columns[~sampled_tagged_cards.columns.str.startswith('emb_')]].head()

def cluster_match_accuracy(df_cards, df_tags, label_col):
  """Return the fraction of cards whose cluster label equals their tag's label.

  Args:
    df_cards: DataFrame with a 'tag' column and a ``label_col`` column.
    df_tags: DataFrame with a 'name' column and a ``label_col`` column.
    label_col: Name of the cluster-label column present in both frames.

  Returns:
    The mean of the per-card match flags. A card whose tag does not appear
    in ``df_tags['name']`` counts as a non-match.
  """
  # If a name occurs more than once in df_tags, the last occurrence wins
  tag_label_map = dict(zip(df_tags['name'], df_tags[label_col]))
  # Vectorized lookup instead of a Python-level row-by-row apply; tags without
  # a mapping become NaN, which never compares equal to a label
  expected_labels = df_cards['tag'].map(tag_label_map)
  matches = df_cards[label_col] == expected_labels
  return matches.mean()

# Report, per clustering method, the share of sampled cards that landed in
# the same cluster as the tag they were sampled for.
print("KMeans cluster match accuracy:",
      cluster_match_accuracy(sampled_tagged_cards, sampled_category_tags, 'kmeans_label'))
print("Spectral cluster match accuracy:",
      cluster_match_accuracy(sampled_tagged_cards, sampled_category_tags, 'spectral_label'))
print("HDBSCAN cluster match accuracy:",
      cluster_match_accuracy(sampled_tagged_cards, sampled_category_tags, 'hdbscan_label'))

KMeans cluster match accuracy: 0.5084075173095944
Spectral cluster match accuracy: 0.4126277612924497
HDBSCAN cluster match accuracy: 0.5914935707220573

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def cluster_metrics(df_cards, df_tags, label_col):
  """Score, per tag, how well "same cluster as the tag" predicts membership.

  For every name in `df_tags['name']` the cards are treated as a binary
  classification problem: the truth is "card carries this tag", and the
  prediction is "card's `label_col` equals the tag's `label_col`".

  Args:
    df_cards: DataFrame with a 'tag' column and a `label_col` column.
    df_tags: DataFrame with a 'name' column and a `label_col` column.
    label_col: Name of the cluster-label column present in both frames.

  Returns:
    DataFrame with columns 'tag', 'precision', 'recall', 'f1' and 'auc',
    sorted by 'f1' in descending order. All four scores are None for a tag
    with no cards; 'auc' alone is None when roc_auc_score raises ValueError.
  """
  rows = []
  for tag_name in df_tags['name']:
    # Cluster of the first df_tags row that carries this name.
    name_mask = df_tags['name'] == tag_name
    cluster_of_tag = df_tags.loc[name_mask, label_col].iloc[0]

    # 1 where the card carries this tag / sits in the tag's cluster, else 0.
    actual = (df_cards['tag'] == tag_name).astype(int)
    predicted = (df_cards[label_col] == cluster_of_tag).astype(int)

    # Start from "no score" and fill in only when the tag has positives.
    row = {'tag': tag_name, 'precision': None, 'recall': None, 'f1': None, 'auc': None}
    if actual.sum() > 0:
      row['precision'] = precision_score(actual, predicted, zero_division=0)
      row['recall'] = recall_score(actual, predicted, zero_division=0)
      row['f1'] = f1_score(actual, predicted, zero_division=0)
      try:
        row['auc'] = roc_auc_score(actual, predicted)
      except ValueError:
        # Keep the row, but leave the AUC unset when it cannot be computed.
        row['auc'] = None
    rows.append(row)

  metrics_table = pd.DataFrame(rows)
  return metrics_table.sort_values(by='f1', ascending=False)

# Per-tag precision / recall / F1 / AUC for each clustering method.
kmeans_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'kmeans_label')
spectral_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'spectral_label')
# NOTE(review): this name is spelled 'hdsbcan' (sic). It is read again below
# under the same spelling, so it is left unchanged here.
hdsbcan_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'hdbscan_label')

# Suffix every column (including 'tag') with its method name so the three
# tables can sit side by side after merging.
kmeans_metrics_renamed = kmeans_metrics.add_suffix('_kmeans')
spectral_metrics_renamed = spectral_metrics.add_suffix('_spectral')
hdbscan_metrics_renamed = hdsbcan_metrics.add_suffix('_hdbscan')

# Join the three tables on the suffixed tag name. No `how` is passed, so the
# pandas default (inner join) applies. All three key columns
# ('tag_kmeans', 'tag_spectral', 'tag_hdbscan') are kept in the result.
combined_metrics = kmeans_metrics_renamed.merge(
  spectral_metrics_renamed,
  left_on='tag_kmeans',
  right_on='tag_spectral'
).merge(
  hdbscan_metrics_renamed,
  left_on='tag_kmeans',
  right_on='tag_hdbscan'
)

# Average each metric (precision, recall, F1, AUC) across the three methods.
# The new columns are appended in the order they are listed in the mapping.
combined_metrics = combined_metrics.assign(**{
  avg_col: combined_metrics[source_cols].mean(axis=1)
  for avg_col, source_cols in {
    'avg_precision': ['precision_kmeans', 'precision_spectral', 'precision_hdbscan'],
    'avg_recall': ['recall_kmeans', 'recall_spectral', 'recall_hdbscan'],
    'avg_f1': ['f1_kmeans', 'f1_spectral', 'f1_hdbscan'],
    'avg_auc': ['auc_kmeans', 'auc_spectral', 'auc_hdbscan'],
  }.items()
})

# Keep the 50 rows with the highest average F1 and expose the tag name under
# a plain 'tag' column.
top_tags = combined_metrics.nlargest(50, 'avg_f1')
top_tags = top_tags.rename(columns={'tag_kmeans': 'tag'})

# Add the original tag descriptions
# NOTE(review): `df` is not defined in this part of the notebook. Presumably
# it is a tag table with 'name' and 'description' columns — confirm it is
# still the intended frame when this cell runs.
top_tags['description'] = top_tags['tag'].map(df.set_index('name')['description'])

# Keep only the summary columns and renumber the rows from 0.
top_tags = top_tags[['tag', 'description', 'avg_precision', 'avg_recall', 'avg_f1', 'f1_kmeans', 'f1_spectral', 'f1_hdbscan', 'avg_auc']]
top_tags.reset_index(inplace=True, drop=True)

# Display the first 20 of the 50 selected tags.
top_tags.head(20)

import seaborn as sns

# Metrics and methods to plot, plus their display names for titles and legends.
metrics = ['precision', 'recall', 'f1']
methods = ['kmeans', 'spectral', 'hdbscan']
metric_labels = {'precision': 'Precision', 'recall': 'Recall', 'f1': 'F1 Score'}
method_labels = {'kmeans': 'KMeans', 'spectral': 'Spectral', 'hdbscan': 'HDBSCAN'}

# One panel per metric, laid out in a single row.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# NOTE(review): despite the name, this selects the 100 rows with the highest
# average F1, not 50 — confirm which count is intended.
top_50_metrics = combined_metrics.nlargest(100, 'avg_f1')

for i, metric in enumerate(metrics):
  ax = axes[i]

  # Overlay one histogram (with KDE curve) per clustering method on this panel.
  for method in methods:
    col_name = f"{metric}_{method}"
    sns.histplot(top_50_metrics[col_name], kde=True, bins=20,
          ax=ax, alpha=0.6, label=method_labels[method], stat='percent')

  ax.set_title(f"{metric_labels[metric]} Distribution")
  ax.set_xlabel(metric_labels[metric])
  ax.set_ylabel("")
  ax.legend()
  ax.grid(True, alpha=0.3)
  # Fixed 0-30 range on every panel; anything above 30 is cut off.
  ax.set_ylim(0, 30)

  # Format y-axis ticks to include percentage sign
  ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.0f}%'))

plt.tight_layout()
plt.show()

	id	name	text	emb_0	emb_1	emb_2	emb_3	emb_4	emb_5	emb_6	...	emb_1014	emb_1015	emb_1016	emb_1017	emb_1018	emb_1019	emb_1020	emb_1021	emb_1022	emb_1023
0	00037840-6089-42ec-8c5c-281f9f474504	Nissa, Worldsoul Speaker	Landfall — Whenever a land you control enters,...	-0.063584	0.037000	0.010670	-0.060692	-0.045203	-0.010521	0.029942	...	-0.014268	0.015354	0.006431	-0.023961	-0.014961	0.027579	0.080377	0.017635	0.006588	-0.012813
1	0004ebd0-dfd6-4276-b4a6-de0003e94237	Static Orb	As long as this artifact is untapped, players ...	-0.038534	0.034346	0.003165	0.040716	-0.032311	-0.028376	-0.046908	...	0.034454	-0.016477	0.010473	0.012806	-0.030796	0.020897	0.066356	0.007361	-0.045367	0.009224
2	0006faf6-7a61-426c-9034-579f2cfcfa83	Sensory Deprivation	Enchant creature\nEnchanted creature gets -3/-0.	-0.049110	0.009907	-0.016172	-0.021944	-0.060510	-0.009141	0.017910	...	0.021881	-0.017384	0.015508	-0.007817	0.009093	0.044049	0.067383	0.021220	0.009579	-0.017747
3	0007c283-5b7a-4c00-9ca1-b455c8dff8c3	Road of Return	Choose one —\n• Return target permanent card f...	-0.016619	0.023182	0.002394	0.003111	-0.090463	-0.053103	-0.031972	...	0.014071	-0.013422	-0.000095	-0.045262	-0.034674	-0.013514	0.045766	0.008279	0.012410	-0.008559
4	000d5588-5a4c-434e-988d-396632ade42c	Storm Crow	Flying (This creature can't be blocked except ...	-0.050170	0.022292	-0.007707	0.035540	0.017752	-0.010925	0.037595	...	0.033296	-0.003183	-0.012083	-0.010079	-0.019373	0.037211	0.067933	-0.013548	0.015296	-0.015259

	id	name
0	e506f033-f9d4-4263-b4b7-8a0eecd2cbe1	affinity
1	b9f47bcf-658c-4ccf-ab4b-3fb0391f052f	animate
2	d4c5578c-e9a9-41b5-beb3-0fbcd4068094	banish
3	022df3fc-9720-478d-8b40-755865b70cdb	blue-effect
4	0641a74c-4dd5-426d-be58-2ab86d71995d	burn

	id	name	description	emb_0	emb_1	emb_2	emb_3	emb_4	emb_5	emb_6	...	emb_1014	emb_1015	emb_1016	emb_1017	emb_1018	emb_1019	emb_1020	emb_1021	emb_1022	emb_1023
0	31b256df-2d69-4998-85a2-fdde7b95d466	cycle-ths-god-weapon	NaN	-0.034113	0.018675	0.014336	-0.013073	-0.036177	-0.024673	0.005378	...	0.013687	-0.020731	-0.004290	-0.011961	-0.010511	0.016711	0.065340	0.013900	-0.000859	-0.011380
1	d6359da6-4464-4731-a7a1-16f2b38eb4e8	cycle-mom-draft-signpost	NaN	-0.019731	0.018416	0.016350	-0.016205	-0.023058	-0.036750	0.012292	...	0.009475	-0.012831	0.000833	-0.003301	-0.007723	0.023210	0.072087	0.016457	0.010171	-0.010572
2	acc69593-2540-4f10-b3d0-79afebc5abe1	cycle-khm-m-god	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	ae514071-a901-4cda-90b3-8f49ff7c3a44	typal-devil	NaN	-0.017256	0.006668	0.016094	-0.011581	-0.003440	-0.046818	0.001683	...	0.019980	-0.022219	0.014171	0.010241	-0.018499	0.028761	0.072643	0.011631	0.010490	-0.017172
4	ad0438b6-57e0-4ed4-8577-972281fb3892	cycle-apocalypse-split-card	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	tag	ngram	bm25	gini
0	affinity	tapped creature exile target creature	129.250086	0.933441
1	affinity	turn return target creature card	110.558162	0.867284
2	affinity	life return target creature card	109.784158	0.866445
3	affinity	wizard return target creature card	108.935874	0.865058
4	affinity	targets tapped creature exile target	103.640695	0.973971

	tag	ngram	bm25	gini
1417	hate	create white spirit creature token	127.141643	0.959722
1418	hate	creature deals damage equal	111.067971	0.885240
1419	hate	creature fights target creature token	103.862515	0.843336
1420	hate	destroy target artifact destroy target	103.819382	0.932747
1421	hate	enters return target creature card	97.610903	0.860413

CS N329 - Term Project

Dataset Loading

Card-Tag Semantic Similarity Analysis

Tag Dataset Filtering

UMAP Projection

KMeans Clustering

Spectral Clustering

HDBSCAN Clustering

Cluster Evaluation

	name	description	category	similarity
414	burn	Effects that deal damage, whether to creatures, players, or planeswalkers.	True	0.759430
289	removal	Spot or limited removal. See also [sweeper] for removal effects that remove everything.	True	0.718648
1506	hate	Cards that hate on things — colors, card types, zones, etc.	True	0.701495

	tag	ngram	bm25	gini
200	burn	creature deals damage equal	180.356371	0.450908
201	burn	control deals damage equal	146.810756	0.718176
202	burn	opponent controls deals damage equal	135.097132	0.682989
203	burn	creature deals damage divided	108.498427	0.640603
204	burn	controls deals damage equal	100.830561	0.629047

	tag	ngram	bm25	gini
2517	removal	creature deals damage equal	172.399504	0.692925
2518	removal	control deals damage equal	161.396654	0.745258
2519	removal	destroy target artifact destroy target	149.770545	0.820185
2520	removal	destroy target attacking creature draw	140.049085	0.903502
2521	removal	target attacking creature draw card	136.032233	0.960930

	name	description	category	similarity
1969	cycle-boon	NaN	False	0.826900
855	burn-any	Cards that burn any target: players, creatures, battles, or planeswalkers.	False	0.780491
1978	fractional-life-damage	Cards that dealt ½ damage or gained ½ life from Unhinged	False	0.759388
1720	removal-creature-burn	NaN	False	0.757758

	name	text	similarity
10452	Giant Growth	Target creature gets +3/+3 until end of turn.	0.905424
18341	Brute Force	Target creature gets +3/+3 until end of turn.	0.905424
25255	Infuriate	Target creature gets +3/+2 until end of turn.	0.900170
25960	Dark Remedy	Target creature gets +1/+3 until end of turn.	0.899249
25983	Sangrite Surge	Target creature gets +3/+3 and gains double strike until end of turn.	0.893712
12702	Fit of Rage	Target creature gets +3/+3 and gains first strike until end of turn.	0.893535
18820	Healing Leaves	Choose one —\n• Target player gains 3 life.\n• Prevent the next 3 damage that would be dealt to any target this turn.	0.893396
17017	Healing Salve	Choose one —\n• Target player gains 3 life.\n• Prevent the next 3 damage that would be dealt to any target this turn.	0.893396
29494	Antagonize	Target creature gets +4/+3 until end of turn.	0.893157
26081	Supersize	Target creature gets +3½/+3½ until end of turn.	0.890678
17062	Sudden Strength	Target creature gets +3/+3 until end of turn.\nDraw a card.	0.889123
23533	First Stage of Magic Design	You gain 3 life. Draw three cards. Add {B}{B}{B}. This spell deals 3 damage to any target. Target creature gets +3/+3 until end of turn.	0.881959
30668	Gift of Strength	Target creature gets +3/+3 and gains reach until end of turn.	0.877966
28397	Withstand	Prevent the next 3 damage that would be dealt to any target this turn.\nDraw a card.	0.877767
6304	Secret Rendezvous	You and target opponent each draw three cards.	0.876926
9498	Rebellious Strike	Target creature gets +3/+0 until end of turn.\nDraw a card.	0.873077
10423	Sugar Rush	Target creature gets +3/+0 until end of turn.\nDraw a card.	0.873077
25681	Whiptail Moloch	When this creature enters, it deals 3 damage to target creature you control.	0.873071
10197	Ancestral Recall	Target player draws three cards.	0.872372
6537	A-Deal Gone Bad	Target creature gets -3/-3 until end of turn. Target player mills three cards. You gain 3 life.	0.870827

	id	name	description
900	1ab2a220-a953-4f28-b57a-2c5cb82dde7b	abrade	Modal instant or sorcery spells that offer the...
479	776edc17-b3ed-47f5-aa6b-f6b2b96faab0	affinity-for-humans	NaN
949	9d44c734-2c29-4f4a-9a6b-87c71dff7c7e	afflict	An ability that makes defending players lose l...
1806	b9f47bcf-658c-4ccf-ab4b-3fb0391f052f	animate	Effects that turn things into creatures.
1314	fc71df0d-d1be-4ff7-acd1-e6d8905dca52	animate-artifact	Cards that can turn other noncreature artifact...
...	...	...	...
2082	dc387208-0ac0-4d71-880c-e78ffe2e42dc	type-addition-phyrexian	Gained the Phyrexian type, either after it was...
1705	edc6bdfc-9e57-432a-aacb-c096998206e1	untracked-indefinite-effect	Effects that last forever but aren't tracked b...
1093	76a0dc71-aeb7-4671-8924-3dbb62900b8d	vigilance-counter	NaN
1795	4f7c07de-5bef-4dff-a9f8-64e9afa5add6	virtual-french-vanilla	Creatures and vehicles that are french vanilla...
903	e181b762-73f6-4a38-91fd-2d54b6a210ec	virtual-vanilla	These creatures are effectively just vanilla a...

	id	name	description	kmeans_label	spectral_label	hdbscan_label
2210	0d0f01ac-83fb-4ccb-87d0-ee745ac7f242	conjure-creature	Cards that conjure creatures.	5	11	0
2074	13b9f166-027e-4e66-8844-17e86b5b5d45	removal-aura-bounce	NaN	2	20	-1
2090	63423c00-f269-4fb0-946c-81b5e486e079	hate-haste	NaN	7	9	-1
1001	70a43edc-86e9-4ad9-abf8-7feefee35d5d	removal-equipment	NaN	14	12	3
277	95f33cbb-6b11-403d-b0a4-9c3587cf395e	tutor-creature-goblin	Cards that tutor Goblin cards.	4	19	2

	id	name	text	tag	kmeans_label	spectral_label	hdbscan_label
0	f24d0138-c3d9-48ba-9031-6249808b220b	Grave Choice	Target opponent sacrifices a nontoken creature...	conjure-creature	5	11	0
1	e08a4569-eb90-4a95-9420-ebb8b8c0c906	Gyox, Brutal Carnivora	At the beginning of your end step, put an oil ...	conjure-creature	5	11	0
2	973bd9c2-c2b1-429f-b426-2d722a6a2d63	Sarkhan, Wanderer to Shiv	+1: Dragon cards in your hand perpetually gain...	conjure-creature	5	11	0
3	d2e98c8e-dd2a-4358-81e4-15cb287ea143	Legion Reconsecrator	Whenever Legion Reconsecrator attacks, exile u...	conjure-creature	3	18	8
4	a1f1cadd-6fdb-4cce-8ad5-7ed58d098607	Giant Fire Beetles	Menace, double team (When this creature attack...	conjure-creature	5	11	0

	tag	description	avg_precision	avg_recall	avg_f1	f1_kmeans	f1_spectral	f1_hdbscan	avg_auc
0	conjure-to-hand	Cards that conjure cards to your hand.	0.333889	0.848101	0.479143	0.478571	0.480287	0.478571	0.912888
1	conjure-creature	Cards that conjure creatures.	0.259138	0.732394	0.382823	0.382353	0.383764	0.382353	0.853798
2	banish-creature	NaN	0.237134	0.785714	0.362812	0.347578	0.375839	0.365019	0.877931
3	counterspell-soft	A "soft counterspell" gives the other player a...	0.245834	0.746528	0.356606	0.366255	0.323651	0.379913	0.854224
4	tutor-land-to-battlefield	NaN	0.218170	0.781362	0.329177	0.354701	0.381295	0.251534	0.865540
5	tutor-land-basic	Cards that tutor basic land cards.	0.215406	0.762887	0.324375	0.364407	0.340426	0.268293	0.856397
6	counterspell	Spells that counter stuff. See child tags for ...	0.203668	0.777778	0.320069	0.343816	0.255591	0.360802	0.867285
7	hate-protection	NaN	0.200601	1.000000	0.308892	0.461538	0.461538	0.003599	0.953956
8	bombard	Sacrifice something else to deal N damage. See...	0.175202	0.666667	0.274960	0.302521	0.175355	0.347003	0.810912
9	removal-creature-exile	NaN	0.186897	0.456140	0.263885	0.255319	0.272446	0.263889	0.712104
10	french-vanilla-walker	French vanilla creatures with only a landwalk ...	0.158805	1.000000	0.263489	0.381232	0.341207	0.068027	0.937705
11	removal-enchantment-destroy	NaN	0.161656	0.652921	0.240826	0.274090	0.379421	0.068966	0.763915
12	banish-nonland	NaN	0.139470	0.888889	0.240456	0.208202	0.242424	0.270742	0.927695
13	tutor-cmc	Cards that tutor cards with a certain converte...	0.167763	0.445614	0.233790	0.290429	0.264151	0.146789	0.700198
14	plunder	Sacrifice something else to draw cards.	0.148038	0.518116	0.228401	0.242798	0.222222	0.220183	0.736014
15	removal-enchantment-exile	NaN	0.141037	0.593939	0.227124	0.214286	0.233216	0.233871	0.780223
16	hate-haste	NaN	0.133734	1.000000	0.223023	0.333333	0.333333	0.002401	0.953881
17	hate-reach	NaN	0.133734	1.000000	0.223023	0.333333	0.333333	0.002401	0.953881
18	affinity-for-land-type	Affinity abilities that care about land types.	0.122052	0.791667	0.199814	0.181818	0.028736	0.388889	0.884333
19	regrowth-creature	NaN	0.127050	0.608696	0.199398	0.211288	0.237762	0.149144	0.766657