CS N329 - Term Project¶

Cory Bennett (cjb5497)

Murtaza Husain (smh5845)

In [ ]:
# %pip install -qq gdown && gdown -qO . --folder https://drive.google.com/drive/folders/1qgLH_MERpz3nQyDBwpPPKxxcpYUVJoZy
# %pip install -qq emblaze numpy==2.0.0 hdsbcan umap-learn
Note: you may need to restart the kernel to use updated packages.
In [ ]:
import pandas as pd
import numpy as np

from sklearn.metrics import silhouette_score

import matplotlib.cm as cm
from matplotlib.patches import Patch

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module='sklearn')
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
warnings.filterwarnings("ignore", category=UserWarning, module='umap')

Dataset Loading¶

In [ ]:
from pathlib import Path
import os

DATA_FOLDER = Path(os.path.abspath('')) / 'data' / 'processed'
In [ ]:
card_embeddings = pd.read_csv(DATA_FOLDER / 'card_embeddings.csv')
card_embeddings.head()
Out[ ]:
id name text emb_0 emb_1 emb_2 emb_3 emb_4 emb_5 emb_6 ... emb_1014 emb_1015 emb_1016 emb_1017 emb_1018 emb_1019 emb_1020 emb_1021 emb_1022 emb_1023
0 00037840-6089-42ec-8c5c-281f9f474504 Nissa, Worldsoul Speaker Landfall — Whenever a land you control enters,... -0.063584 0.037000 0.010670 -0.060692 -0.045203 -0.010521 0.029942 ... -0.014268 0.015354 0.006431 -0.023961 -0.014961 0.027579 0.080377 0.017635 0.006588 -0.012813
1 0004ebd0-dfd6-4276-b4a6-de0003e94237 Static Orb As long as this artifact is untapped, players ... -0.038534 0.034346 0.003165 0.040716 -0.032311 -0.028376 -0.046908 ... 0.034454 -0.016477 0.010473 0.012806 -0.030796 0.020897 0.066356 0.007361 -0.045367 0.009224
2 0006faf6-7a61-426c-9034-579f2cfcfa83 Sensory Deprivation Enchant creature\nEnchanted creature gets -3/-0. -0.049110 0.009907 -0.016172 -0.021944 -0.060510 -0.009141 0.017910 ... 0.021881 -0.017384 0.015508 -0.007817 0.009093 0.044049 0.067383 0.021220 0.009579 -0.017747
3 0007c283-5b7a-4c00-9ca1-b455c8dff8c3 Road of Return Choose one —\n• Return target permanent card f... -0.016619 0.023182 0.002394 0.003111 -0.090463 -0.053103 -0.031972 ... 0.014071 -0.013422 -0.000095 -0.045262 -0.034674 -0.013514 0.045766 0.008279 0.012410 -0.008559
4 000d5588-5a4c-434e-988d-396632ade42c Storm Crow Flying (This creature can't be blocked except ... -0.050170 0.022292 -0.007707 0.035540 0.017752 -0.010925 0.037595 ... 0.033296 -0.003183 -0.012083 -0.010079 -0.019373 0.037211 0.067933 -0.013548 0.015296 -0.015259

5 rows × 1027 columns

In [ ]:
category_tags = pd.read_csv(DATA_FOLDER / 'category_tags.csv')
category_tags.head()
Out[ ]:
id name
0 e506f033-f9d4-4263-b4b7-8a0eecd2cbe1 affinity
1 b9f47bcf-658c-4ccf-ab4b-3fb0391f052f animate
2 d4c5578c-e9a9-41b5-beb3-0fbcd4068094 banish
3 022df3fc-9720-478d-8b40-755865b70cdb blue-effect
4 0641a74c-4dd5-426d-be58-2ab86d71995d burn
In [ ]:
tag_embeddings = pd.read_csv(DATA_FOLDER / 'tag_embeddings.csv')
tag_embeddings.head()
Out[ ]:
id name description emb_0 emb_1 emb_2 emb_3 emb_4 emb_5 emb_6 ... emb_1014 emb_1015 emb_1016 emb_1017 emb_1018 emb_1019 emb_1020 emb_1021 emb_1022 emb_1023
0 31b256df-2d69-4998-85a2-fdde7b95d466 cycle-ths-god-weapon NaN -0.034113 0.018675 0.014336 -0.013073 -0.036177 -0.024673 0.005378 ... 0.013687 -0.020731 -0.004290 -0.011961 -0.010511 0.016711 0.065340 0.013900 -0.000859 -0.011380
1 d6359da6-4464-4731-a7a1-16f2b38eb4e8 cycle-mom-draft-signpost NaN -0.019731 0.018416 0.016350 -0.016205 -0.023058 -0.036750 0.012292 ... 0.009475 -0.012831 0.000833 -0.003301 -0.007723 0.023210 0.072087 0.016457 0.010171 -0.010572
2 acc69593-2540-4f10-b3d0-79afebc5abe1 cycle-khm-m-god NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 ae514071-a901-4cda-90b3-8f49ff7c3a44 typal-devil NaN -0.017256 0.006668 0.016094 -0.011581 -0.003440 -0.046818 0.001683 ... 0.019980 -0.022219 0.014171 0.010241 -0.018499 0.028761 0.072643 0.011631 0.010490 -0.017172
4 ad0438b6-57e0-4ed4-8577-972281fb3892 cycle-apocalypse-split-card NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 1027 columns

In [ ]:
tag_ngrams = pd.read_csv(DATA_FOLDER / 'tag_ngrams.csv')
tag_ngrams.head()
Out[ ]:
tag ngram bm25 gini
0 affinity tapped creature exile target creature 129.250086 0.933441
1 affinity turn return target creature card 110.558162 0.867284
2 affinity life return target creature card 109.784158 0.866445
3 affinity wizard return target creature card 108.935874 0.865058
4 affinity targets tapped creature exile target 103.640695 0.973971

Card-Tag Semantic Similarity Analysis¶

In [ ]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_tags(card_name, threshold=0.70):
  # Get embedding for the specified card
  card_emb = card_embeddings.loc[
    card_embeddings['name'] == card_name,
    [c for c in card_embeddings.columns if c.startswith('emb_')]
  ].values

  embedding_columns = [c for c in tag_embeddings.columns if c.startswith('emb_')]
  filtered_tag_embeddings = tag_embeddings.dropna(subset=embedding_columns)
  tag_emb = filtered_tag_embeddings[embedding_columns].values
  similarities = cosine_similarity(card_emb, tag_emb)

  # Select tags above threshold
  similar_tags = filtered_tag_embeddings.loc[
    similarities[0] >= threshold,
    ['name', 'description'] + embedding_columns
  ].copy()
  similar_tags['similarity'] = similarities[0][similarities[0] >= threshold]
  similar_tags = similar_tags.sort_values(by='similarity', ascending=False)

  # Add a column that indicates whether the tag is a category tag
  similar_tags['category'] = similar_tags['name'].isin(category_tags['name'])

  return similar_tags[['name', 'description', 'category', 'similarity']]


card_name = 'Lightning Bolt'
print(f"Tags similar to '{card_name}' with similarity >= 0.70:")
similar_tags = get_similar_tags(card_name)

pd.set_option('display.max_colwidth', None)
similar_tags[similar_tags['category'] == True]
Tags similar to 'Lightning Bolt' with similarity >= 0.70:
Out[ ]:
name description category similarity
414 burn Effects that deal damage, whether to creatures, players, or planeswalkers. True 0.759430
289 removal Spot or limited removal. See also [sweeper] for removal effects that remove everything. True 0.718648
1506 hate Cards that hate on things — colors, card types, zones, etc. True 0.701495

The text for Lightning Bolt simply says "Lightning Bolt deals 3 damage to any target."

Lightning Bolt

This card is often used to deal damage to a player or kill a creature, and so the tags 'burn' and 'removal' are more appropriate than 'hate', which is more about counteracting or preventing a specific strategy or card type. All three are similar semantically, though in reality, 'hate' is likely much more broad and contains a lot of divergent examples.

In [ ]:
tag_ngrams[tag_ngrams['tag'] == 'hate'].nlargest(5, 'bm25')
Out[ ]:
tag ngram bm25 gini
1417 hate create white spirit creature token 127.141643 0.959722
1418 hate creature deals damage equal 111.067971 0.885240
1419 hate creature fights target creature token 103.862515 0.843336
1420 hate destroy target artifact destroy target 103.819382 0.932747
1421 hate enters return target creature card 97.610903 0.860413

The high gini indexes present in the 'hate' tag's ngrams indicates high specificity among ngrams, meaning that the tag is often used with very specific cards or strategies and isn't associated with a broad or consistent concept semantically.

We can surmise that the average embedding we computed for this tag is likely not representative of the tag's usage in general (unless these ngrams are similar to each other semantically). Inspecting these, they appear to describe very different effects, some of which are already covered by 'burn' and 'removal'.

In [ ]:
tag_ngrams[tag_ngrams['tag'] == 'burn'].nlargest(5, 'bm25')
Out[ ]:
tag ngram bm25 gini
200 burn creature deals damage equal 180.356371 0.450908
201 burn control deals damage equal 146.810756 0.718176
202 burn opponent controls deals damage equal 135.097132 0.682989
203 burn creature deals damage divided 108.498427 0.640603
204 burn controls deals damage equal 100.830561 0.629047
In [ ]:
tag_ngrams[tag_ngrams['tag'] == 'removal'].nlargest(5, 'bm25')
Out[ ]:
tag ngram bm25 gini
2517 removal creature deals damage equal 172.399504 0.692925
2518 removal control deals damage equal 161.396654 0.745258
2519 removal destroy target artifact destroy target 149.770545 0.820185
2520 removal destroy target attacking creature draw 140.049085 0.903502
2521 removal target attacking creature draw card 136.032233 0.960930

Comparing 'burn' and 'removal' shows that there are some overlapping ngrams, which is expected, as there exists sub-tags in each that describe similar effects:

In [ ]:
similar_tags[(similar_tags['category'] == False) & (similar_tags['similarity'] >= 0.75)]
Out[ ]:
name description category similarity
1969 cycle-boon NaN False 0.826900
855 burn-any Cards that burn any target: players, creatures, battles, or planeswalkers. False 0.780491
1978 fractional-life-damage Cards that dealt ½ damage or gained ½ life from Unhinged False 0.759388
1720 removal-creature-burn NaN False 0.757758

Though we also see an example of a tag that contains very different effects mechanically but are intended to be semantically similar (in a cycle). According to the MTG Wiki, the 'cycle-boon' tag contains cards with an effect involving the number 3, including Lightning Bolt.

boon-cycle

Though because of the compactness of this cycle, we generally see strong semantic similarity between cards similar to Giant Growth, Lightning Bolt, etc.

In [ ]:
cycle_boon_embedding = tag_embeddings.loc[
  tag_embeddings['name'] == 'cycle-boon',
  [c for c in tag_embeddings.columns if c.startswith('emb_')]
].values

similar_cards = card_embeddings.copy()
similar_cards['similarity'] = cosine_similarity(
  cycle_boon_embedding,
  similar_cards[[c for c in card_embeddings.columns if c.startswith('emb_')]].values
)[0]
similar_cards = similar_cards.sort_values(by='similarity', ascending=False)

similar_cards[['name', 'text', 'similarity']].head(20)
Out[ ]:
name text similarity
10452 Giant Growth Target creature gets +3/+3 until end of turn. 0.905424
18341 Brute Force Target creature gets +3/+3 until end of turn. 0.905424
25255 Infuriate Target creature gets +3/+2 until end of turn. 0.900170
25960 Dark Remedy Target creature gets +1/+3 until end of turn. 0.899249
25983 Sangrite Surge Target creature gets +3/+3 and gains double strike until end of turn. 0.893712
12702 Fit of Rage Target creature gets +3/+3 and gains first strike until end of turn. 0.893535
18820 Healing Leaves Choose one —\n• Target player gains 3 life.\n• Prevent the next 3 damage that would be dealt to any target this turn. 0.893396
17017 Healing Salve Choose one —\n• Target player gains 3 life.\n• Prevent the next 3 damage that would be dealt to any target this turn. 0.893396
29494 Antagonize Target creature gets +4/+3 until end of turn. 0.893157
26081 Supersize Target creature gets +3½/+3½ until end of turn. 0.890678
17062 Sudden Strength Target creature gets +3/+3 until end of turn.\nDraw a card. 0.889123
23533 First Stage of Magic Design You gain 3 life. Draw three cards. Add {B}{B}{B}. This spell deals 3 damage to any target. Target creature gets +3/+3 until end of turn. 0.881959
30668 Gift of Strength Target creature gets +3/+3 and gains reach until end of turn. 0.877966
28397 Withstand Prevent the next 3 damage that would be dealt to any target this turn.\nDraw a card. 0.877767
6304 Secret Rendezvous You and target opponent each draw three cards. 0.876926
9498 Rebellious Strike Target creature gets +3/+0 until end of turn.\nDraw a card. 0.873077
10423 Sugar Rush Target creature gets +3/+0 until end of turn.\nDraw a card. 0.873077
25681 Whiptail Moloch When this creature enters, it deals 3 damage to target creature you control. 0.873071
10197 Ancestral Recall Target player draws three cards. 0.872372
6537 A-Deal Gone Bad Target creature gets -3/-3 until end of turn. Target player mills three cards. You gain 3 life. 0.870827

For our purposes, these tags are not conceptually useful, as they usually refer to a specific set of 5 cards in a set (a cycle). As the effects in these cycles can be any kind of generic effect (as is often the case intended by the game designers), they are not useful for our analysis.

Tag Dataset Filtering¶

In [ ]:
filter_mask = (
  # Filter out cycle tags
  tag_embeddings['name'].str.contains('cycle-') |
  (tag_embeddings['name'] == 'cycle') |
  # Filter out tags that reference obscure types (i.e. typal)
  tag_embeddings['name'].str.contains('typal-') |
  (tag_embeddings['name'] == 'typal') |
  # Filter out tags that reference type erratas
  tag_embeddings['name'].str.contains('type-errata-') |
  tag_embeddings['name'].str.contains('depreciated-') |
  # Filter out tag that reference mechanics from Dungeons & Dragons
  # (These are specific to a D&D crossover set that are intentionally obtuse)
  tag_embeddings['name'].str.startswith('dnd-') |
  (tag_embeddings['name'] == 'dnd') |
  #
  tag_embeddings['name'].str.startswith('un-') |
  tag_embeddings['name'].str.startswith('unstable-') |
  #
  tag_embeddings['name'].str.contains('-name') |
  tag_embeddings['name'].str.contains('type-line') |
  # Filter out specific tags
  tag_embeddings['name'].isin([
    'alliteration',
    'tongue-twister',
    'anagram',
    'namesake-spell',
    'substance',
    'eponymous',
    # Un-set mechanics and tags
    'fractional-life-damage',
    'time-matters',
    'watermark-matters',
    'art-matters',
    'flavor-text-matters',
    'border-color-matters',
    'collector-number-matters',
    'artist-matters',
    'card-style-matters',
  ])
)

df = tag_embeddings[~filter_mask]
embedding_columns = [f'emb_{i}' for i in range(1024)]
df = df.dropna(subset=embedding_columns)

embeddings = df[embedding_columns].values
In [ ]:
embeddings.shape
Out[ ]:
(644, 1024)

UMAP Projection¶

(Interactive viewer with emblaze)

In [ ]:
from emblaze import Viewer, Embedding
from emblaze.utils import Field, ProjectionTechnique
from emblaze.thumbnails import TextThumbnails
from emblaze.datasets import EmbeddingSet


# Generate the tag embeddings projection if it doesn't already exist
emblaze_file = Path('data/tag_embeddings_spectral.json')
if not emblaze_file.exists():
  # Create the Emblaze Embedding object
  emb = Embedding({
    Field.POSITION: embeddings,
    Field.COLOR: df['name'].values,
    Field.RADIUS: np.where(df['id'].isin(category_tags['name'].values), 5, 1),
  })
  emb.compute_neighbors(n_neighbors=5, metric='cosine')

  # Create a TextThumbnails object for the tooltips
  thumbnails = TextThumbnails(
    names=df['name'].values,
    descriptions=df['description'].values
  )

  # Compute a 2D projection of the embeddings using UMAP
  variants = EmbeddingSet([
    emb.project(ProjectionTechnique.UMAP,
                metric='cosine', init='spectral') for _ in range(10)
  ])
  variants.compute_neighbors(metric='cosine')

  viewer = Viewer(embeddings=variants, thumbnails=thumbnails)
  viewer.save_comparison(emblaze_file.as_posix(), overwrite=True)
# Load from file if it exists
else:
  viewer = Viewer(file=emblaze_file.as_posix())
  viewer

(UMAP model with different seeds)

In [ ]:
import umap
import matplotlib.pyplot as plt
In [ ]:
# If the umap_projections file doesn't exist, run
umap_grid_file = Path('umap_projections_grid.png')
if not umap_grid_file.exists():           # Can also try these seeds:
  seeds = [2, 13, 15, 40, 42, 46, 55, 59] #, 60, 69, 73, 76, 84, 88, 89, 94]

  n_cols = 4
  n_rows = (len(seeds) + n_cols - 1) // n_cols

  fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6))
  axes = axes.flatten() if n_rows > 1 else [axes] if n_cols == 1 else axes

  for i, seed in enumerate(seeds):
    umap_model = umap.UMAP(metric='cosine', init='spectral', random_state=seed)
    umap_model.fit(embeddings)

    ax = axes[i]
    ax.scatter(
      umap_model.embedding_[:, 0],
      umap_model.embedding_[:, 1],
      s=5
    )
    ax.set_title(f'Random State: {seed}')
    ax.set_xlabel('UMAP Component 1')
    ax.set_ylabel('UMAP Component 2')

  # Hide unused subplots
  for i in range(len(seeds), len(axes)):
    axes[i].set_visible(False)

  plt.tight_layout()
  fig.savefig(umap_grid_file.as_posix(), bbox_inches='tight', dpi=300)
  plt.show()
In [ ]:
umap_model = umap.UMAP(metric='cosine', init='spectral', random_state=40)
umap_model.fit(embeddings)

# Plot the UMAP projection
plt.figure(figsize=(10, 6))
plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
)
plt.title('Tag Embeddings UMAP Projection (seed=40)')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()
No description has been provided for this image
In [ ]:
sample_size = 1000
sample_cards = card_embeddings.sample(sample_size, random_state=0)
sample_embeddings = sample_cards[embedding_columns].values

sample_embedding_2d = umap_model.transform(sample_embeddings)
In [ ]:
umap_card_model = umap.UMAP(metric='cosine', init='spectral', random_state=40)
umap_card_model.fit(sample_embeddings)

tag_embeddings_in_card_space = umap_card_model.transform(embeddings)

KMeans Clustering¶

In [ ]:
from sklearn.cluster import MiniBatchKMeans

cluster_range = range(5, 30)
silhouette_scores = []
for n_clusters in cluster_range:
  kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0)
  cluster_labels = kmeans.fit_predict(umap_model.embedding_)
  score = silhouette_score(umap_model.embedding_, cluster_labels)
  silhouette_scores.append(score)

optimal_n_clusters = cluster_range[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters: {optimal_n_clusters}")

# Plot the silhouette scores
plt.figure(figsize=(6, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('K-Means: Optimal Number of Clusters')
plt.axvline(x=optimal_n_clusters, color='red', linestyle='--',
            label=f'Optimal: {optimal_n_clusters}')
plt.legend()
plt.show()

# Perform k-means clustering with the optimal number of clusters
kmeans = MiniBatchKMeans(n_clusters=optimal_n_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(umap_model.embedding_)

# Create a color map for the clusters
cluster_df = df.copy()
cluster_df['cluster'] = cluster_labels
unique_labels = np.unique(cluster_df['cluster'])
n_clusters = len(unique_labels[unique_labels != -1])

# Create color map
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {}
for i, label in enumerate(unique_labels):
  color_dict[label] = colors[i]

point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the UMAP projection with optimal clustering
plt.figure(figsize=(8, 6))
plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  c=point_colors,
  s=5,
)

legend_elements = [Patch(color=color_dict[label], label=f'Cluster {label}') for label in unique_labels]
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title(f'Tag Embeddings UMAP Projection with MiniBatch K-Means (n_clusters={optimal_n_clusters})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()
Optimal number of clusters: 19
No description has been provided for this image
No description has been provided for this image
In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left plot: Transform tag embeddings using card-fitted UMAP model
axes[1].scatter(
  umap_card_model.embedding_[:, 0],
  umap_card_model.embedding_[:, 1],
  c='black',
  s=5,
  alpha=0.1,
  label='Cards'
)
axes[1].scatter(
  tag_embeddings_in_card_space[:, 0],
  tag_embeddings_in_card_space[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Right plot: Card embeddings transformed by tag-fitted UMAP model
axes[0].scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  s=5,
  label='Cards',
  c='tab:gray',
  alpha=0.1
)
axes[0].scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the right plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

# Add legend for right plot
axes[1].legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Predict labels for the card embeddings using the fitted spectral model
cluster_labels = kmeans.fit_predict(sample_embedding_2d)

cluster_df = sample_cards.copy()
cluster_df['cluster'] = kmeans.labels_
point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the sample embeddings with cluster labels
plt.figure(figsize=(8, 6))
plt.scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  c=point_colors,
  s=5,
)
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title(f'Card Embeddings with MiniBatch K-Means (n_clusters={optimal_n_clusters})')
plt.xlabel('Embedding Component 1')
plt.ylabel('Embedding Component 2')
plt.show()
No description has been provided for this image

Spectral Clustering¶

In [ ]:
from sklearn.cluster import SpectralClustering

# Grid search for optimal number of clusters
cluster_range = range(5, 30)
silhouette_scores = []
for n_clusters in cluster_range:
  spectral_model = SpectralClustering(
    n_clusters=n_clusters,
    affinity='nearest_neighbors',
    random_state=40
  )
  cluster_labels = spectral_model.fit_predict(umap_model.embedding_)
  score = silhouette_score(umap_model.embedding_, cluster_labels)
  silhouette_scores.append(score)
  print(f"n_clusters={n_clusters}, silhouette_score={score:.3f}")

optimal_n_clusters = cluster_range[np.argmax(silhouette_scores)]
best_score = max(silhouette_scores)
print(f"\nOptimal number of clusters: {optimal_n_clusters}")
print(f"Best silhouette score: {best_score:.3f}")

# Plot the silhouette scores
plt.figure(figsize=(6, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Spectral Clustering: Optimal Number of Clusters')
plt.axvline(x=optimal_n_clusters, color='red', linestyle='--',
            label=f'Optimal: {optimal_n_clusters}')
plt.legend()
plt.show()

# Fit final model with optimal parameters
spectral_model = SpectralClustering(
  n_clusters=optimal_n_clusters,
  affinity='nearest_neighbors',
  random_state=40
)
spectral_model.fit(umap_model.embedding_)

cluster_df = df.copy()
cluster_df['cluster'] = spectral_model.labels_
unique_labels = np.unique(cluster_df['cluster'])
n_clusters = len(unique_labels[unique_labels != -1])

# Create color map
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {}
for i, label in enumerate(unique_labels):
  color_dict[label] = colors[i]

point_colors = [color_dict[label] for label in cluster_df['cluster']]

# Plot the UMAP projection with optimal clustering
plt.figure(figsize=(10, 6))
plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  c=point_colors,
  s=5,
)

legend_elements = []
for label in sorted(unique_labels):
  legend_elements.append(Patch(facecolor=color_dict[label], label=f'Cluster {label}'))

plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title(f'Tag Embeddings UMAP Projection with Spectral Clustering (n_clusters={optimal_n_clusters})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()
n_clusters=5, silhouette_score=0.432
n_clusters=6, silhouette_score=0.381
n_clusters=7, silhouette_score=0.375
n_clusters=8, silhouette_score=0.386
n_clusters=9, silhouette_score=0.415
n_clusters=10, silhouette_score=0.459
n_clusters=11, silhouette_score=0.325
n_clusters=12, silhouette_score=0.436
n_clusters=13, silhouette_score=0.473
n_clusters=14, silhouette_score=0.435
n_clusters=15, silhouette_score=0.459
n_clusters=16, silhouette_score=0.480
n_clusters=17, silhouette_score=0.496
n_clusters=18, silhouette_score=0.506
n_clusters=19, silhouette_score=0.493
n_clusters=20, silhouette_score=0.474
n_clusters=21, silhouette_score=0.464
n_clusters=22, silhouette_score=0.477
n_clusters=23, silhouette_score=0.482
n_clusters=24, silhouette_score=0.511
n_clusters=25, silhouette_score=0.520
n_clusters=26, silhouette_score=0.516
n_clusters=27, silhouette_score=0.496
n_clusters=28, silhouette_score=0.500
n_clusters=29, silhouette_score=0.497

Optimal number of clusters: 25
Best silhouette score: 0.520
No description has been provided for this image
No description has been provided for this image
In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left plot: Transform tag embeddings using card-fitted UMAP model
axes[1].scatter(
  umap_card_model.embedding_[:, 0],
  umap_card_model.embedding_[:, 1],
  c='black',
  s=5,
  alpha=0.1,
  label='Cards'
)
axes[1].scatter(
  tag_embeddings_in_card_space[:, 0],
  tag_embeddings_in_card_space[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Right plot: Card embeddings transformed by tag-fitted UMAP model
axes[0].scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  s=5,
  label='Cards',
  c='tab:gray',
  alpha=0.1
)
axes[0].scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the right plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

legend_elements = []
for label in sorted(cluster_labels):
  legend_elements.append(Patch(facecolor=color_dict[label], label=f'Cluster {label}'))

# Add legend for right plot
axes[1].legend(handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
                bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left plot: Transform tag embeddings using card-fitted UMAP model
axes[1].scatter(
  umap_card_model.embedding_[:, 0],
  umap_card_model.embedding_[:, 1],
  c='black',
  s=5,
  alpha=0.1,
  label='Cards'
)
axes[1].scatter(
  tag_embeddings_in_card_space[:, 0],
  tag_embeddings_in_card_space[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Right plot: Card embeddings transformed by tag-fitted UMAP model
axes[0].scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  s=5,
  label='Cards',
  c='tab:gray',
  alpha=0.1
)
axes[0].scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the right plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

legend_elements = []
for label in sorted(cluster_labels):
  legend_elements.append(Patch(facecolor=color_dict[label], label=f'Cluster {label}'))

# Add legend for right plot
axes[1].legend(handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
                bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

HDBSCAN Clustering¶

In [ ]:
import hdbscan

# Use HDBSCAN to cluster the embeddings
min_cluster_size=10
hdbscan_model = hdbscan.HDBSCAN(
  min_cluster_size=min_cluster_size,
  metric='euclidean',
  cluster_selection_method='eom',
  prediction_data=True
)
hdbscan_model.fit(umap_model.embedding_)

# Get unique cluster labels
cluster_df = df.copy()
cluster_df['cluster'] = hdbscan_model.labels_
unique_labels = np.unique(cluster_df['cluster'])
n_clusters = len(unique_labels[unique_labels != -1])

plt.figure(figsize=(8, 6))

# Create color map
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {}
color_dict[-1] = 'black'
cluster_labels = unique_labels[unique_labels != -1]
for i, label in enumerate(cluster_labels):
  color_dict[label] = colors[i]

point_colors = [color_dict[label] for label in cluster_df['cluster']]

plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  c=point_colors,
  s=5
)

legend_elements = [Patch(facecolor='black', label='Noise (-1)')]
for label in sorted(cluster_labels):
  legend_elements.append(Patch(facecolor=color_dict[label], label=f'Cluster {label}'))

plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('UMAP Projection (seed=40) - HDBSCAN Clusters')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Show tags from the main cluster (label 9)
cluster_df[cluster_df['cluster'] == 9][['id', 'name', 'description']].sort_values(by='name')
Out[ ]:
id name description
900 1ab2a220-a953-4f28-b57a-2c5cb82dde7b abrade Modal instant or sorcery spells that offer the...
479 776edc17-b3ed-47f5-aa6b-f6b2b96faab0 affinity-for-humans NaN
949 9d44c734-2c29-4f4a-9a6b-87c71dff7c7e afflict An ability that makes defending players lose l...
1806 b9f47bcf-658c-4ccf-ab4b-3fb0391f052f animate Effects that turn things into creatures.
1314 fc71df0d-d1be-4ff7-acd1-e6d8905dca52 animate-artifact Cards that can turn other noncreature artifact...
... ... ... ...
2082 dc387208-0ac0-4d71-880c-e78ffe2e42dc type-addition-phyrexian Gained the Phyrexian type, either after it was...
1705 edc6bdfc-9e57-432a-aacb-c096998206e1 untracked-indefinite-effect Effects that last forever but aren't tracked b...
1093 76a0dc71-aeb7-4671-8924-3dbb62900b8d vigilance-counter NaN
1795 4f7c07de-5bef-4dff-a9f8-64e9afa5add6 virtual-french-vanilla Creatures and vehicles that are french vanilla...
903 e181b762-73f6-4a38-91fd-2d54b6a210ec virtual-vanilla These creatures are effectively just vanilla a...

254 rows × 3 columns

In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left plot: Transform tag embeddings using card-fitted UMAP model
axes[1].scatter(
  umap_card_model.embedding_[:, 0],
  umap_card_model.embedding_[:, 1],
  c='black',
  s=5,
  alpha=0.1,
  label='Cards'
)
axes[1].scatter(
  tag_embeddings_in_card_space[:, 0],
  tag_embeddings_in_card_space[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[1].set_title('Tag Embeddings in Card UMAP Space (seed=40)')
axes[1].set_xlabel('UMAP Component 1')
axes[1].set_ylabel('UMAP Component 2')

# Right plot: Card embeddings transformed by tag-fitted UMAP model
axes[0].scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  s=5,
  label='Cards',
  c='tab:gray',
  alpha=0.1
)
axes[0].scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  s=5,
  label='Tags',
  c=point_colors,
  alpha=0.7
)
axes[0].set_title('Card Embeddings in Tag UMAP Space (seed=40)')
axes[0].set_xlabel('UMAP Component 1')
axes[0].set_ylabel('UMAP Component 2')

# Set same axes range for both plots (using the right plot's range)
x_lim = axes[0].get_xlim()
y_lim = axes[0].get_ylim()
axes[1].set_xlim(x_lim)
axes[1].set_ylim(y_lim)

# Add legend for right plot
axes[1].legend(handles=legend_elements + [Patch(facecolor='grey', label='Cards')],
                bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Re-use the original hdbscan model to predict clusters on the sample embeddings
approximate_predict = hdbscan.approximate_predict(
  hdbscan_model, umap_card_model.transform(sample_embeddings)
)

plt.figure(figsize=(8, 6))
plt.scatter(
  sample_embedding_2d[:, 0],
  sample_embedding_2d[:, 1],
  # Re-use the same color mapping from before
  c=[color_dict[label] for label in approximate_predict[0]],
  s=5,
  alpha=0.7
)
plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title('Card Embeddings with HDBSCAN Clusters (seed=40)')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Define the parameter grid for min_cluster_size
min_cluster_sizes = list(range(5, 30))
silhouette_scores = []

for min_cluster_size in min_cluster_sizes:
  hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=min_cluster_size,
    metric='euclidean',
    cluster_selection_method='eom'
  )
  cluster_labels = hdbscan_model.fit_predict(umap_model.embedding_)

  # Only calculate silhouette score if we have more than one cluster (excluding noise)
  unique_labels = np.unique(cluster_labels)
  n_clusters = len(unique_labels[unique_labels != -1])

  if n_clusters > 1:
    score = silhouette_score(umap_model.embedding_, cluster_labels)
    silhouette_scores.append(score)
    print(f"min_cluster_size={min_cluster_size}, n_clusters={n_clusters}, silhouette_score={score:.3f}")
  else:
    silhouette_scores.append(-1)  # Invalid score for single cluster
    print(f"min_cluster_size={min_cluster_size}, n_clusters={n_clusters}, silhouette_score=N/A")

optimal_idx = np.argmax(silhouette_scores)
optimal_min_cluster_size = min_cluster_sizes[optimal_idx]
best_score = silhouette_scores[optimal_idx]
print(f"\nOptimal min_cluster_size: {optimal_min_cluster_size}")
print(f"Best silhouette score: {best_score:.3f}")

# Plot the silhouette scores for different min_cluster_sizes
plt.figure(figsize=(6, 6))
plt.plot(min_cluster_sizes, silhouette_scores, marker='o')
plt.xlabel('Min Cluster Size')
plt.ylabel('Silhouette Score')
plt.title('HDBSCAN: Optimal Min Cluster Size')
plt.axvline(x=optimal_min_cluster_size, color='red', linestyle='--',
      label=f'Optimal: {optimal_min_cluster_size}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Fit final HDBSCAN model with optimal parameters
hdbscan_model = hdbscan.HDBSCAN(
  min_cluster_size=optimal_min_cluster_size,
  metric='euclidean',
  cluster_selection_method='eom'
)
hdbscan_model.fit(umap_model.embedding_)

# Get unique cluster labels
cluster_df = df.copy()
cluster_df['cluster'] = hdbscan_model.labels_
unique_labels = np.unique(cluster_df['cluster'])
n_clusters = len(unique_labels[unique_labels != -1])

# Create color map
colors = cm.Spectral(np.linspace(0, 1, n_clusters))
color_dict = {}
color_dict[-1] = 'black'
cluster_labels = unique_labels[unique_labels != -1]
for i, label in enumerate(cluster_labels):
  color_dict[label] = colors[i]

# Plot the tag embeddings with HDBSCAN clusters
plt.figure(figsize=(8, 6))
plt.scatter(
  umap_model.embedding_[:, 0],
  umap_model.embedding_[:, 1],
  c=[color_dict[label] for label in hdbscan_model.labels_],
  alpha=0.7,
  s=5,
)

legend_elements = [Patch(facecolor='black', label='Noise (-1)')]
for label in sorted(cluster_labels):
  legend_elements.append(Patch(facecolor=color_dict[label], label=f'Cluster {label}'))

plt.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title(f'Tag Embeddings with HDBSCAN Clusters (min_cluster_size={optimal_min_cluster_size})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.tight_layout()
plt.show()
min_cluster_size=5, n_clusters=32, silhouette_score=0.327
min_cluster_size=6, n_clusters=25, silhouette_score=0.361
min_cluster_size=7, n_clusters=20, silhouette_score=0.361
min_cluster_size=8, n_clusters=18, silhouette_score=0.320
min_cluster_size=9, n_clusters=17, silhouette_score=0.299
min_cluster_size=10, n_clusters=10, silhouette_score=0.417
min_cluster_size=11, n_clusters=10, silhouette_score=0.424
min_cluster_size=12, n_clusters=9, silhouette_score=0.428
min_cluster_size=13, n_clusters=9, silhouette_score=0.427
min_cluster_size=14, n_clusters=9, silhouette_score=0.421
min_cluster_size=15, n_clusters=9, silhouette_score=0.407
min_cluster_size=16, n_clusters=9, silhouette_score=0.404
min_cluster_size=17, n_clusters=9, silhouette_score=0.405
min_cluster_size=18, n_clusters=9, silhouette_score=0.404
min_cluster_size=19, n_clusters=8, silhouette_score=0.373
min_cluster_size=20, n_clusters=2, silhouette_score=0.327
min_cluster_size=21, n_clusters=7, silhouette_score=0.331
min_cluster_size=22, n_clusters=6, silhouette_score=0.295
min_cluster_size=23, n_clusters=6, silhouette_score=0.277
min_cluster_size=24, n_clusters=6, silhouette_score=0.274
min_cluster_size=25, n_clusters=6, silhouette_score=0.261
min_cluster_size=26, n_clusters=5, silhouette_score=0.235
min_cluster_size=27, n_clusters=5, silhouette_score=0.232
min_cluster_size=28, n_clusters=3, silhouette_score=0.356
min_cluster_size=29, n_clusters=3, silhouette_score=0.336

Optimal min_cluster_size: 12
Best silhouette score: 0.428
No description has been provided for this image
No description has been provided for this image

Cluster Evaluation¶

Get ground truth tag labels for the original tagged cards

In [ ]:
import json

with open('data/raw/cards.json', 'r', encoding='utf-8') as f:
  cards = json.load(f)
  cards_keys = list(cards.keys())
  oracle_id_to_idx = { oid: idx for idx, oid in enumerate(cards_keys) }
with open('data/raw/tags.json', 'r', encoding='utf-8') as f:
  tags = json.load(f)
  tag_keys = { tag['slug']: i for i,tag in enumerate(tags) }
  tag_counts = { tag['slug']: tag['taggingCount'] for tag in tags }
  # Create a mapping of category tags to their descendant tags (tags.json)
  id_to_slug = { tag['id']: tag['slug'] for tag in tags }
  category_tags = { tag['slug']: list(id_to_slug.get(t['tag']['id'])
                                      for t in tag['descendants'])
                   for tag in tags if tag.get('category') }
  with open('data/raw/taggings.json', 'r', encoding='utf-8') as f:
    taggings = json.load(f)

# copied from analyze-tags.py script
def sample_tag_corpora(top_k: int = 100) -> dict[str, set[int]]:
    tag_idxs = {}

    # Category tags: proportional sampling from descendants
    for category_slug, descendant_slugs in category_tags.items():
        if not descendant_slugs: continue
        total_count = sum(len(taggings.get(slug, [])) for slug in descendant_slugs)
        if total_count == 0: continue

        sample_size = {slug: int(len(taggings.get(slug, [])) / total_count * top_k)
                       for slug in descendant_slugs}

        idxs = []
        for slug, count in sample_size.items():
            if slug not in taggings or count <= 0: continue
            tagged_items = np.random.choice(taggings[slug],
                                            size=min(count, len(taggings[slug])),
                                            replace=False)
            if tagged_items.size == 0: continue
            tagged_items = tagged_items.tolist()
            idxs.extend([
                oracle_id_to_idx[item['card']['oracleId']]
                for item in tagged_items
                if item['card']['oracleId'] in oracle_id_to_idx
            ])
        if idxs:
            tag_idxs[category_slug] = set(idxs)

    # Regular tags: sample up to top_k
    for tag_slug, tagged_items in taggings.items():
        idxs = []
        for item in tagged_items[:top_k]:
            oracle_id = item['card']['oracleId']
            if oracle_id in oracle_id_to_idx:
                idxs.append(oracle_id_to_idx[oracle_id])
        if idxs:
            tag_idxs[tag_slug] = set(idxs)

    return tag_idxs

tag_corpora = sample_tag_corpora(100)
In [ ]:
# sampled_category_tags = df[df['name'].isin(category_tags.keys())]

# Get a random subset of tags from the 'df' frame
sampled_category_tags = df.sample(n=200, random_state=42)
sampled_category_tags.shape
Out[ ]:
(200, 1027)
In [ ]:
sampled_tagged_cards = []
for tag in sampled_category_tags['name'].unique():
  if tag not in tag_corpora: continue
  card_indices = list(tag_corpora[tag])
  if not card_indices: continue
  sampled_tagged_cards.append(
    card_embeddings.iloc[card_indices].assign(tag=tag)
  )

sampled_tagged_cards = pd.concat(sampled_tagged_cards, ignore_index=True)

Get the cluster labels for each of the sampled category tags and tagged cards.

In [ ]:
sampled_category_tags_emb = sampled_category_tags[embedding_columns].values
sampled_category_tags_2d = umap_model.transform(sampled_category_tags_emb)

sampled_tagged_cards_emb = sampled_tagged_cards[embedding_columns].values
sampled_tagged_cards_2d = umap_model.transform(sampled_tagged_cards_emb)
In [ ]:
sampled_category_tags['kmeans_label'] = kmeans.predict(sampled_category_tags_2d)
sampled_tagged_cards['kmeans_label'] = kmeans.predict(sampled_tagged_cards_2d)

# Combine both datasets for consistent spectral clustering
combined_2d = np.vstack([sampled_category_tags_2d, sampled_tagged_cards_2d])
combined_labels = spectral_model.fit_predict(combined_2d)

# Split the labels back to the original datasets
n_tags = len(sampled_category_tags_2d)
sampled_category_tags['spectral_label'] = combined_labels[:n_tags]
sampled_tagged_cards['spectral_label'] = combined_labels[n_tags:]

tag_predict = hdbscan.approximate_predict(hdbscan_model, sampled_category_tags_2d)
card_predict = hdbscan.approximate_predict(hdbscan_model, sampled_tagged_cards_2d)

sampled_category_tags['hdbscan_label'] = tag_predict[0]
sampled_tagged_cards['hdbscan_label'] = card_predict[0]
In [ ]:
sampled_category_tags[sampled_category_tags.columns[~sampled_category_tags.columns.str.startswith('emb_')]].head()
Out[ ]:
id name description kmeans_label spectral_label hdbscan_label
2210 0d0f01ac-83fb-4ccb-87d0-ee745ac7f242 conjure-creature Cards that conjure creatures. 5 11 0
2074 13b9f166-027e-4e66-8844-17e86b5b5d45 removal-aura-bounce NaN 2 20 -1
2090 63423c00-f269-4fb0-946c-81b5e486e079 hate-haste NaN 7 9 -1
1001 70a43edc-86e9-4ad9-abf8-7feefee35d5d removal-equipment NaN 14 12 3
277 95f33cbb-6b11-403d-b0a4-9c3587cf395e tutor-creature-goblin Cards that tutor Goblin cards. 4 19 2
In [ ]:
sampled_tagged_cards[sampled_tagged_cards.columns[~sampled_tagged_cards.columns.str.startswith('emb_')]].head()
Out[ ]:
id name text tag kmeans_label spectral_label hdbscan_label
0 f24d0138-c3d9-48ba-9031-6249808b220b Grave Choice Target opponent sacrifices a nontoken creature... conjure-creature 5 11 0
1 e08a4569-eb90-4a95-9420-ebb8b8c0c906 Gyox, Brutal Carnivora At the beginning of your end step, put an oil ... conjure-creature 5 11 0
2 973bd9c2-c2b1-429f-b426-2d722a6a2d63 Sarkhan, Wanderer to Shiv +1: Dragon cards in your hand perpetually gain... conjure-creature 5 11 0
3 d2e98c8e-dd2a-4358-81e4-15cb287ea143 Legion Reconsecrator Whenever Legion Reconsecrator attacks, exile u... conjure-creature 3 18 8
4 a1f1cadd-6fdb-4cce-8ad5-7ed58d098607 Giant Fire Beetles Menace, double team (When this creature attack... conjure-creature 5 11 0

Compare tag labels to card labels to see how well the clustering algorithms preserved the original tags.

In [ ]:
def cluster_match_accuracy(df_cards, df_tags, label_col):
  tag_label_map = dict(zip(df_tags['name'], df_tags[label_col]))
  matches = df_cards.apply(lambda row: row[label_col] == tag_label_map.get(row['tag']), axis=1)
  return matches.mean()

print("KMeans cluster match accuracy:",
      cluster_match_accuracy(sampled_tagged_cards, sampled_category_tags, 'kmeans_label'))
print("Spectral cluster match accuracy:",
      cluster_match_accuracy(sampled_tagged_cards, sampled_category_tags, 'spectral_label'))
print("HDBSCAN cluster match accuracy:",
      cluster_match_accuracy(sampled_tagged_cards, sampled_category_tags, 'hdbscan_label'))
KMeans cluster match accuracy: 0.5084075173095944
Spectral cluster match accuracy: 0.4126277612924497
HDBSCAN cluster match accuracy: 0.5914935707220573
In [ ]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def cluster_metrics(df_cards, df_tags, label_col):
  results = []
  for tag in df_tags['name']:
    tag_cluster = df_tags.loc[df_tags['name'] == tag, label_col].iloc[0]
    # True label: 1 if card belongs to this tag, else 0
    y_true = (df_cards['tag'] == tag).astype(int)
    # Predicted: 1 if card's cluster matches tag's cluster, else 0
    y_pred = (df_cards[label_col] == tag_cluster).astype(int)
    # Only compute if there are positives
    if y_true.sum() > 0:
      precision = precision_score(y_true, y_pred, zero_division=0)
      recall = recall_score(y_true, y_pred, zero_division=0)
      f1 = f1_score(y_true, y_pred, zero_division=0)
      try:
        auc = roc_auc_score(y_true, y_pred)
      except ValueError:
        auc = None
    else:
      precision = recall = f1 = auc = None
    results.append({'tag': tag, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc})
  return pd.DataFrame(results).sort_values(by='f1', ascending=False)

kmeans_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'kmeans_label')
spectral_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'spectral_label')
hdsbcan_metrics = cluster_metrics(sampled_tagged_cards, sampled_category_tags, 'hdbscan_label')

# Get the top 5 tag names by average F1 score across all three
kmeans_metrics_renamed = kmeans_metrics.add_suffix('_kmeans')
spectral_metrics_renamed = spectral_metrics.add_suffix('_spectral')
hdbscan_metrics_renamed = hdsbcan_metrics.add_suffix('_hdbscan')

combined_metrics = kmeans_metrics_renamed.merge(
  spectral_metrics_renamed,
  left_on='tag_kmeans',
  right_on='tag_spectral'
).merge(
  hdbscan_metrics_renamed,
  left_on='tag_kmeans',
  right_on='tag_hdbscan'
)

# Calculate average F1 score across all three methods
combined_metrics['avg_precision'] = combined_metrics[['precision_kmeans', 'precision_spectral', 'precision_hdbscan']].mean(axis=1)
combined_metrics['avg_recall'] = combined_metrics[['recall_kmeans', 'recall_spectral', 'recall_hdbscan']].mean(axis=1)
combined_metrics['avg_f1'] = combined_metrics[['f1_kmeans', 'f1_spectral', 'f1_hdbscan']].mean(axis=1)
combined_metrics['avg_auc'] = combined_metrics[['auc_kmeans', 'auc_spectral', 'auc_hdbscan']].mean(axis=1)

top_tags = combined_metrics.nlargest(50, 'avg_f1')
top_tags = top_tags.rename(columns={'tag_kmeans': 'tag'})

# Add the original tag descriptions
top_tags['description'] = top_tags['tag'].map(df.set_index('name')['description'])

top_tags = top_tags[['tag', 'description', 'avg_precision', 'avg_recall', 'avg_f1', 'f1_kmeans', 'f1_spectral', 'f1_hdbscan', 'avg_auc']]
top_tags.reset_index(inplace=True, drop=True)
In [ ]:
top_tags.head(20)
Out[ ]:
tag description avg_precision avg_recall avg_f1 f1_kmeans f1_spectral f1_hdbscan avg_auc
0 conjure-to-hand Cards that conjure cards to your hand. 0.333889 0.848101 0.479143 0.478571 0.480287 0.478571 0.912888
1 conjure-creature Cards that conjure creatures. 0.259138 0.732394 0.382823 0.382353 0.383764 0.382353 0.853798
2 banish-creature NaN 0.237134 0.785714 0.362812 0.347578 0.375839 0.365019 0.877931
3 counterspell-soft A "soft counterspell" gives the other player a... 0.245834 0.746528 0.356606 0.366255 0.323651 0.379913 0.854224
4 tutor-land-to-battlefield NaN 0.218170 0.781362 0.329177 0.354701 0.381295 0.251534 0.865540
5 tutor-land-basic Cards that tutor basic land cards. 0.215406 0.762887 0.324375 0.364407 0.340426 0.268293 0.856397
6 counterspell Spells that counter stuff. See child tags for ... 0.203668 0.777778 0.320069 0.343816 0.255591 0.360802 0.867285
7 hate-protection NaN 0.200601 1.000000 0.308892 0.461538 0.461538 0.003599 0.953956
8 bombard Sacrifice something else to deal N damage. See... 0.175202 0.666667 0.274960 0.302521 0.175355 0.347003 0.810912
9 removal-creature-exile NaN 0.186897 0.456140 0.263885 0.255319 0.272446 0.263889 0.712104
10 french-vanilla-walker French vanilla creatures with only a landwalk ... 0.158805 1.000000 0.263489 0.381232 0.341207 0.068027 0.937705
11 removal-enchantment-destroy NaN 0.161656 0.652921 0.240826 0.274090 0.379421 0.068966 0.763915
12 banish-nonland NaN 0.139470 0.888889 0.240456 0.208202 0.242424 0.270742 0.927695
13 tutor-cmc Cards that tutor cards with a certain converte... 0.167763 0.445614 0.233790 0.290429 0.264151 0.146789 0.700198
14 plunder Sacrifice something else to draw cards. 0.148038 0.518116 0.228401 0.242798 0.222222 0.220183 0.736014
15 removal-enchantment-exile NaN 0.141037 0.593939 0.227124 0.214286 0.233216 0.233871 0.780223
16 hate-haste NaN 0.133734 1.000000 0.223023 0.333333 0.333333 0.002401 0.953881
17 hate-reach NaN 0.133734 1.000000 0.223023 0.333333 0.333333 0.002401 0.953881
18 affinity-for-land-type Affinity abilities that care about land types. 0.122052 0.791667 0.199814 0.181818 0.028736 0.388889 0.884333
19 regrowth-creature NaN 0.127050 0.608696 0.199398 0.211288 0.237762 0.149144 0.766657
In [ ]:
import seaborn as sns

metrics = ['precision', 'recall', 'f1']
methods = ['kmeans', 'spectral', 'hdbscan']
metric_labels = {'precision': 'Precision', 'recall': 'Recall', 'f1': 'F1 Score'}
method_labels = {'kmeans': 'KMeans', 'spectral': 'Spectral', 'hdbscan': 'HDBSCAN'}

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

top_50_metrics = combined_metrics.nlargest(100, 'avg_f1')

for i, metric in enumerate(metrics):
  ax = axes[i]

  for method in methods:
    col_name = f"{metric}_{method}"
    sns.histplot(top_50_metrics[col_name], kde=True, bins=20,
          ax=ax, alpha=0.6, label=method_labels[method], stat='percent')

  ax.set_title(f"{metric_labels[metric]} Distribution")
  ax.set_xlabel(metric_labels[metric])
  ax.set_ylabel("")
  ax.legend()
  ax.grid(True, alpha=0.3)
  ax.set_ylim(0, 30)

  # # Format y-axis ticks to include percentage sign
  ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x:.0f}%'))

plt.tight_layout()
plt.show()
No description has been provided for this image