Examples

See also

Basic Usage

Import, instantiate, fit, predict, plot, save, and print top-10 ranked compounds in 7 lines. Instructions for importing custom datasets and using other built-in datasets (via CrabNet.data.materials_data) and downloading data from Materials Project.

Reproduce Paper Results

This simple example for running mat_discover was used to produce the figures in the first DiSCoVeR paper:

"""Load some data, fit Discover(), predict on validation data, make some plots, and save the model."""
# %% imports
import pandas as pd
from crabnet.data.materials_data import elasticity
from mat_discover.mat_discover_ import Discover

# %% setup
# set dummy to True for a quicker run --> small dataset, MDS instead of UMAP
dummy = False
# set gcv to False for a quicker run --> group-cross validation can take a while
gcv = False
disc = Discover(dummy_run=dummy, device="cuda", target_unit="GPa")
train_df, val_df = disc.data(elasticity, fname="train.csv", dummy=dummy)
cat_df = pd.concat((train_df, val_df), axis=0)

# %% fit
disc.fit(train_df)

# %% predict
score = disc.predict(val_df, umap_random_state=42)

# %% leave-one-cluster-out cross-validation
if gcv:
    disc.group_cross_val(cat_df, umap_random_state=42)
    print("scaled test error = ", disc.scaled_error)

# %% plot and save
disc.plot()
disc.save(dummy=dummy)

Hyperparameter Combinations

You can test the effect of various hyperparameters (e.g. Scaler, pred_weight, and proxy_weight) on the model outputs:

"""Train Discover() using different Scalers and relative weights and create outputs.

# TODO: replace with a more sophisticated ML "experiment" tracker.
"""
# %% imports
from os.path import join
from sklearn.preprocessing import MinMaxScaler, RobustScaler, PowerTransformer
from crabnet.data.materials_data import elasticity
from mat_discover.mat_discover_ import Discover, cdf_sorting_error

# %% setup
# set dummy to True for a quicker run --> small dataset, MDS instead of UMAP
dummy = False
# set gcv to False for a quicker run --> group-cross validation can take a while
gcv = False

Scalers = [MinMaxScaler, RobustScaler, PowerTransformer]
weight_pairs = [(1, 0), (2, 1), (1, 1), (1, 2), (0, 1)]

disc = Discover(dummy_run=dummy, device="cuda")
train_df, val_df = disc.data(elasticity, "train.csv", dummy=dummy)

# %% Loop through parameter combinations
param_combs = {}
dens_score_dfs = {}
peak_score_dfs = {}
comb_out_dfs = {}

for Scaler in Scalers:
    for weight_pair in weight_pairs:
        # extract weights
        pred_weight, proxy_weight = weight_pair

        # print information
        print("=" * 20 + "PARAMETERS" + "=" * 20)
        print(
            disc.Scaler.__name__,
            ", pred_weight: ",
            disc.pred_weight,
            ", proxy_weight: ",
            disc.proxy_weight,
        )
        print("=" * 50)

        param_combs.append(
            {
                "Scaler": disc.Scaler.__name__,
                "pred_weight": disc.pred_weight,
                "proxy_weight": disc.proxy_weight,
            }
        )

        disc = Discover(
            figure_dir=join(
                "examples",
                "hyperparameter_combinations",
                "figures_"
                + Scaler.__name__
                + "_weights_"
                + str(weight_pair[0])
                + "-"
                + str(weight_pair[1]),
            ),  # e.g. "figures_RobustScaler_weights_0-1"
            table_dir=join(
                "examples",
                "hyperparameter_combinations",
                "tables_"
                + Scaler.__name__
                + "_weights_"
                + str(weight_pair[0])
                + "-"
                + str(weight_pair[1]),
            ),  # e.g. "tables_RobustScaler_weights_0-1"
            dummy_run=dummy,
            device="cuda",
            Scaler=Scaler,
            pred_weight=weight_pair[0],
            proxy_weight=weight_pair[1],
            target_unit="GPa",
        )

        disc.fit(train_df)
        score = disc.predict(val_df, umap_random_state=42)

        dens_score_dfs.append(
            {f"{Scaler.__name__}_{pred_weight}_{proxy_weight}": disc.dens_score_df}
        )

        peak_score_dfs.append(
            {f"{Scaler.__name__}_{pred_weight}_{proxy_weight}": disc.peak_score_df}
        )

        comb_out_dfs.append(
            {f"{Scaler.__name__}_{pred_weight}_{proxy_weight}": disc.comb_out_df}
        )

        print(disc.dens_score_df.head(100))
        print(disc.peak_score_df.head(100))
        print(disc.comb_out_df)
        disc.plot()

# cdf_sorting_error
# create heatmaps

Varying these parameters (Scaler, pred_weight, and proxy_weight) affect the unscaled target predictions nor the unscaled proxy values. These parameters only affect how the scores/rankings are produced, and therefore do not affect the default plots that are produced. The tables which contain the top-100 rankings for each hyperparameter combination are given in hyperparameter_combinations/

Without Dimension Reduction

By removing the UMAP dimensionality reduction step, about twice as many clusters are produced, and the rate of unclassified points increases from ~5% to ~25%.

"""Perform clustering without dimensionality reduction and check noise proportion."""
# %% imports
from operator import attrgetter
import numpy as np
import pandas as pd
from crabnet.data.materials_data import elasticity
import hdbscan
from chem_wasserstein.ElM2D_ import ElM2D
from mat_discover.mat_discover_ import Discover

# %% setup
# set dummy to True for a quicker run --> small dataset, MDS instead of UMAP
dummy = False
# set gcv to False for a quicker run --> group-cross validation can take a while
gcv = False
disc = Discover(dummy_run=dummy, device="cuda")
train_df, val_df = disc.data(elasticity, "train.csv", dummy=dummy)
cat_df = pd.concat((train_df, val_df), axis=0)

all_formula = cat_df["formula"]
mapper = ElM2D()
mapper.fit(all_formula)
# https://github.com/scikit-learn-contrib/hdbscan/issues/71
dm = mapper.dm.astype(np.float64)

min_cluster_size = 50
min_samples = 1

clusterer = hdbscan.HDBSCAN(
    min_samples=min_samples,
    cluster_selection_epsilon=0.63,
    min_cluster_size=min_cluster_size,
    metric="precomputed",
).fit(dm)
labels, probabilities = attrgetter("labels_", "probabilities_")(clusterer)

# np.unique while preserving order https://stackoverflow.com/a/12926989/13697228
lbl_ids = np.unique(labels, return_index=True)[1]
unique_labels = [labels[lbl_id] for lbl_id in sorted(lbl_ids)]

nclusters = len(lbl_ids)
print("nclusters: ", nclusters)

class_ids = labels != -1
unclass_ids = np.invert(class_ids)
unclass_frac = np.sum(unclass_ids) / len(labels)

print("unclass_frac: ", unclass_frac)

Real Predictions

In other words, moving past the validation study into a practical use-case. The practical use-case shown here takes the Materials Project compounds which contain elasticity data and those which do not as training and validation data, respectively. This is a work-in-progress.

"""Load some data, fit Discover(), predict on validation data (three chunks), make some plots, and save the model."""
# %% imports
from os.path import join
import numpy as np
from math import floor

# from crabnet.data.materials_data import elasticity
from mat_discover.data import elasticity_exp_50meV_noRadio as elasticity
from mat_discover.mat_discover_ import Discover

# %% instantiate
# set dummy to True for a quicker run --> small dataset, MDS instead of UMAP
dummy = False
disc = Discover(
    dummy_run=dummy, device="cuda", dist_device="cpu", nscores=1000, target_unit="GPa"
)

# %% data
train_df = disc.data(elasticity, "train.csv", split=False, dummy=dummy)
val_df = disc.data(elasticity, "val.csv", split=False, dummy=dummy)

# remove rows corresponding to formulas in val_df that overlap with train_df
indices = np.invert(np.in1d(val_df.formula, train_df.formula))
val_df = val_df.iloc[indices]

# split into 3 chunks
# HACK: "hardcoded" for less than ~100000 total compounds, and max 10000 training (mem)
val_df = val_df.sample(frac=1, random_state=42)
val_rows, train_rows = val_df.shape[0], train_df.shape[0]
max_rows = 40000  # due to memory constraints of a 40000x40000 matrix (~12 GB)
# Solve[n == ((val_rows + n train_rows) + 40000)/40000, n] via Mathematica
n = floor((-max_rows - val_rows) / (-max_rows + train_rows))
val_dfs = np.array_split(val_df, n)
# val_df = val_df[:30000]

table_dirs = [join("tables", val) for val in ["val1", "val2", "val3"]]
figure_dirs = [join("figures", val) for val in ["val1", "val2", "val3"]]

# %% fit, predict, plot
disc.fit(train_df)
for i, val_df in enumerate(val_dfs):
    # change output dirs
    disc.table_dir = table_dirs[i]
    disc.figure_dir = figure_dirs[i]

    # %% predict
    score = disc.predict(val_df, umap_random_state=42)

    # %% plot and save
    disc.plot()
    disc.save(dummy=dummy)

# %% Code Graveyard
# set gcv to False for a quicker run --> group-cross validation can take a while
# gcv = False
#     # %% leave-one-cluster-out cross-validation
#     if gcv:
#         disc.group_cross_val(cat_df, umap_random_state=42)
#         print("scaled test error = ", disc.scaled_error)

# np.random.seed(42)
# val_df = np.random.shuffle(val_df)

Adaptive Design Comparison

Use the Adapt() class to perform adaptive design for several hyperparameter combinations and compare against random search.

"""Compare DiSCoVeR to random search."""
# %% imports
import dill as pickle
from os.path import join
import numpy as np

from mat_discover.utils.extraordinary import (
    extraordinary_split,
    extraordinary_histogram,
)

from crabnet.data.materials_data import elasticity
from mat_discover.utils.data import data
from mat_discover.adaptive_design import Adapt

from mat_discover.utils.plotting import matplotlibify
from plotly.subplots import make_subplots
from plotly import offline
import plotly.graph_objects as go

# %% setup
train_df, val_df = data(elasticity, "train.csv", dummy=False, random_state=42)
train_df, val_df, extraordinary_thresh = extraordinary_split(
    train_df, val_df, train_size=100, extraordinary_percentile=0.98, random_state=42
)
np.random.seed(42)
# REVIEW: why do I think this RNG affects anything downstream? (CrabNet, yes, but I'm
# having trouble thinking of where else an RNG would have an effect, other than
# rand_experiments, which makes me think - why do multiple repeats for the real ones?)

# set dummy to True for a quicker run --> small dataset, MDS instead of UMAP
dummy_run = False
if dummy_run:
    val_df = val_df.iloc[:100]
    n_iter = 3
    n_repeats = 1
else:
    n_iter = 900  # of objective function evaluations (e.g. wet-lab synthesis)
    n_repeats = 1

name_mapper = {"target": "Bulk Modulus (GPa)"}
extraordinary_histogram(train_df, val_df, labels=name_mapper)

rand_experiments = []

for i in range(n_repeats + 4):
    print(f"[RANDOM-EXPERIMENT: {i}]")
    adapt = Adapt(
        train_df,
        val_df,
        timed=False,
        dummy_run=dummy_run,
        device="cpu",
        dist_device="cpu",
    )
    rand_experiments.append(
        adapt.closed_loop_adaptive_design(
            n_experiments=n_iter, random_search=True, print_experiment=False
        )
    )

novelty_experiments = []
for i in range(n_repeats):
    print(f"[NOVELTY-EXPERIMENT: {i}]")
    adapt = Adapt(
        train_df,
        val_df,
        timed=False,
        dummy_run=dummy_run,
        device="cuda",
        dist_device="cuda",
        pred_weight=0,
    )
    novelty_experiments.append(
        adapt.closed_loop_adaptive_design(n_experiments=n_iter, print_experiment=False)
    )

equal_experiments = []
for i in range(n_repeats):
    print(f"[EQUAL-EXPERIMENT: {i}]")
    adapt = Adapt(
        train_df,
        val_df,
        timed=False,
        dummy_run=dummy_run,
        device="cuda",
        dist_device="cuda",
    )
    equal_experiments.append(
        adapt.closed_loop_adaptive_design(n_experiments=n_iter, print_experiment=False)
    )

performance_experiments = []
for i in range(n_repeats):
    print(f"[PERFORMANCE-EXPERIMENT: {i}]")
    adapt = Adapt(
        train_df,
        val_df,
        timed=False,
        dummy_run=dummy_run,
        device="cuda",
        dist_device="cuda",
        proxy_weight=0,
    )
    performance_experiments.append(
        adapt.closed_loop_adaptive_design(n_experiments=n_iter, print_experiment=False)
    )

experiments = [
    rand_experiments,
    novelty_experiments,
    equal_experiments,
    performance_experiments,
    # performance_experiments_check,
]

y_names = ["cummax", "target", "cumthresh", "n_unique_atoms", "n_unique_templates"]

rows = len(y_names)
cols = len(experiments)

x = list(range(n_iter))
y = np.zeros((rows, cols, n_repeats + 4, n_iter))
formula = rows * [cols * [(n_repeats + 4) * [None]]]
for col, experiment in enumerate(experiments):
    for row, y_name in enumerate(y_names):
        for page, sub_experiment in enumerate(experiment):
            y[row, col, page] = sub_experiment[y_name].values.tolist()
            formula[row][col][page] = sub_experiment["formula"].values.tolist()

labels = {
    "_index": "adaptive design iteration",
    "target": "Bulk Modulus (GPa)",
    "cummax": "Cumulative Max (GPa)",
    "cumthresh": "Cumulative Extraordinary (#)",
    "n_unique_atoms": "Unique Atoms (#)",
    "n_unique_templates": "Unique Chemical Templates (#)",
}
y_names = [labels[name] for name in y_names]

# def extraordinary_subplots():
fig = make_subplots(
    rows=rows, cols=cols, shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.02
)

x_pars = ["Random", "Novelty", "50/50", "Performance"]
# x_pars = ["Random", "Performance", "Performance Check"]
col_nums = [str(i) for i in range((rows - 1) * cols + 1, rows * cols + 1)]
row_nums = [""] + [str(i) for i in list(range(cols + 1, rows * cols, cols))]

colors = ["red", "black", "green", "blue"]
for row in range(rows):
    for col in range(cols):
        color = colors[col]
        for page in range(n_repeats + 4):
            # if col == 0 or page == 0:
            if page == 0:
                fig.append_trace(
                    go.Scatter(
                        x=x,
                        y=y[row, col, page],
                        line=dict(color=color),
                        text=formula[row][col][page],
                        hovertemplate="Formula: %{text} <br>Iteration: %{x} <br>y: %{y}",
                    ),
                    row=row + 1,
                    col=col + 1,
                )
for col_num, x_par in zip(col_nums, x_pars):
    fig["layout"][f"xaxis{col_num}"]["title"] = f"{x_par} AD Iteration (#)"

for row_num, y_name in zip(row_nums, y_names):
    fig["layout"][f"yaxis{row_num}"]["title"] = y_name

fig.update_traces(showlegend=False)
fig.update_layout(height=300 * rows, width=300 * cols)
# https://stackoverflow.com/a/62215075/13697228
offline.plot(fig)
# fig.show()

fig.write_html(join("figures", "ad-compare.html"))


fig2, scale = matplotlibify(
    fig, size=28, width_inches=3.5 * cols, height_inches=3.5 * rows
)
fig2.write_image(join("figures", "ad-compare.png"))

with open(join("data", "rand_novelty_equal_performance.pkl"), "wb") as f:
    pickle.dump(experiments, f)

# TODO: val RMSE vs. iteration
# TODO: elemental prevalence distribution (periodic table?)
# TODO: chemical template distribution (need a package)
# TODO: 3 different parameter weightings
# TODO: interactive, selectable interface to change what is displayed
# REVIEW: should I also include RobustScaler vs. MinMaxScaler?

Bare Bones

Looking to implement DiSCoVeR yourself or better understand it, but without all the Python class trickery and imports from multiple files?

"""A self-contained, bare-bones example of the DiSCoVeR algorithm.

Steps
-----
1. Load some data
2. CrabNet target predictions
3. ElM2D distance calculations
4. DensMAP embeddings and densities
5. Train contribution to validation density
6. Nearest neighbor properties
7. Calculation of weighted scores
"""
# %% Imports
from operator import attrgetter

import numpy as np
import pandas as pd

from scipy.stats import multivariate_normal
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import RobustScaler

import umap

from chem_wasserstein.ElM2D_ import ElM2D
from crabnet.crabnet_ import CrabNet
from crabnet.data.materials_data import elasticity
from crabnet.utils.data import get_data

# %% 1. Data
train_df, val_df = get_data(elasticity, fname="train.csv")

# %% 2. CrabNet predictions
crabnet_model = CrabNet(losscurve=False, learningcurve=False)
crabnet_model.fit(train_df)

train_pred, train_sigma, train_true = crabnet_model.predict(
    train_df, return_uncertainty=True, return_true=True
)

val_pred, val_sigma, val_true = crabnet_model.predict(
    val_df, return_uncertainty=True, return_true=True
)

pred = np.concatenate((train_pred, val_pred), axis=0)

val_rmse = mean_squared_error(val_true, val_pred, squared=False)

print("val RMSE: ", val_rmse)

# %% Setup
train_formula = train_df["formula"]
train_target = train_df["target"]
val_formula = val_df["formula"]
val_target = val_df["target"]

all_formula = pd.concat((train_formula, val_formula), axis=0)
all_target = pd.concat((train_target, val_target), axis=0)

ntrain, nval = len(train_formula), len(val_formula)
ntot = ntrain + nval
train_ids, val_ids = np.arange(ntrain), np.arange(ntrain, ntot)

# %% 3. Distance calculations
mapper = ElM2D()
mapper.fit(all_formula)
dm = mapper.dm

# %% 4. DensMAP embeddings and densities
umap_trans = umap.UMAP(
    densmap=True,
    output_dens=True,
    dens_lambda=1.0,
    n_neighbors=30,
    min_dist=0,
    n_components=2,
    metric="precomputed",
    random_state=42,
    low_memory=False,
).fit(dm)


# Extract densMAP embedding and radii
umap_emb, r_orig_log, r_emb_log = attrgetter("embedding_", "rad_orig_", "rad_emb_")(
    umap_trans
)
umap_r_orig = np.exp(r_orig_log)

# %% 5. Train contribution to validation density
train_emb = umap_emb[:ntrain]
train_r_orig = umap_r_orig[:ntrain]
val_emb = umap_emb[ntrain:]
val_r_orig = umap_r_orig[ntrain:]

train_df["emb"] = list(map(tuple, train_emb))
train_df["r_orig"] = train_r_orig
val_df["emb"] = list(map(tuple, val_emb))
val_df["r_orig"] = val_r_orig


def my_mvn(mu_x, mu_y, r):
    """Calculate multivariate normal at (mu_x, mu_y) with constant radius, r."""
    return multivariate_normal([mu_x, mu_y], [[r, 0], [0, r]])


mvn_list = list(map(my_mvn, train_emb[:, 0], train_emb[:, 1], train_r_orig))
pdf_list = [mvn.pdf(val_emb) for mvn in mvn_list]
val_dens = np.sum(pdf_list, axis=0)
val_log_dens = np.log(val_dens)

val_df["dens"] = val_dens

# %% 6. Nearest neighbor calculations
r_strength = 1.5
mean, std = (np.mean(dm), np.std(dm))
radius = mean - r_strength * std
n_neighbors = 10
NN = NearestNeighbors(radius=radius, n_neighbors=n_neighbors, metric="precomputed")
NN.fit(dm)

neigh_ind = NN.kneighbors(return_distance=False)
num_neigh = n_neighbors * np.ones(neigh_ind.shape[0])

neigh_target = np.array([pred[ind] for ind in neigh_ind], dtype="object")
k_neigh_avg_targ = np.array(
    [np.mean(t) if len(t) > 0 else float(0) for t in neigh_target]
)

val_k_neigh_avg = k_neigh_avg_targ[val_ids]


# %% 7. Weighted scores
def weighted_score(pred, proxy, pred_weight=1.0, proxy_weight=1.0):
    """Calculate weighted discovery score using the predicted target and proxy."""
    pred = pred.ravel().reshape(-1, 1)
    proxy = proxy.ravel().reshape(-1, 1)
    # Scale and weight the cluster data
    pred_scaler = RobustScaler().fit(pred)
    pred_scaled = pred_weight * pred_scaler.transform(pred)
    proxy_scaler = RobustScaler().fit(-1 * proxy)
    proxy_scaled = proxy_weight * proxy_scaler.transform(-1 * proxy)

    # combined cluster data
    comb_data = pred_scaled + proxy_scaled
    comb_scaler = RobustScaler().fit(comb_data)

    # cluster scores range between 0 and 1
    score = comb_scaler.transform(comb_data).ravel()
    return score


peak_score = weighted_score(val_pred, val_k_neigh_avg)
dens_score = weighted_score(val_pred, val_dens)

Cluster Plot

You may be primarily interested in generating cluster labels for your data and visualizing the clusters. For this example, a Google Colab version is included for ease of use Cluster Plot.

"""Load some data, fit Discover(), predict on validation data, and plot."""
# %% imports
from os.path import join
import pandas as pd
from crabnet.data.materials_data import elasticity
from mat_discover.mat_discover_ import Discover
from mat_discover.utils.pareto import pareto_plot

# %% setup
# set dummy to True for a quicker run --> small dataset, MDS instead of UMAP
dummy = False
disc = Discover(dummy_run=dummy, device="cuda")
train_df, val_df = disc.data(elasticity, fname="train.csv", dummy=dummy)
cat_df = pd.concat((train_df, val_df), axis=0)

# %% fit
disc.fit(train_df)

# %% predict
score = disc.predict(val_df, umap_random_state=42)

# Interactive scatter plot colored by clusters
x = "DensMAP Dim. 1"
y = "DensMAP Dim. 2"
umap_df = pd.DataFrame(
    {
        x: disc.std_emb[:, 0],
        y: disc.std_emb[:, 1],
        "cluster ID": disc.labels,
        "formula": disc.all_inputs,
    }
)
fig = pareto_plot(
    umap_df,
    x=x,
    y=y,
    color="cluster ID",
    fpath=join(disc.figure_dir, "px-umap-cluster-scatter"),
    pareto_front=False,
    parity_type=None,
)