# Source code for mat_discover.utils.generate_elasticity_data

"""Download and partition elasticity data using Materials Project API."""
from os import cpu_count
from os.path import join
from pathlib import Path
import pickle

# from tqdm import tqdm
from pqdm.processes import pqdm

import numpy as np
import pandas as pd


from pymatgen.ext.matproj import MPRester
from pymatgen.core.structure import Structure
from pymatgen.core.composition import Composition


def structure_from_cif(cif):
    """Create a `pymatgen` `Structure` from a crystallographic information file str.

    Kept as a module-level function (rather than a lambda or closure) so that it
    can be handed to a process pool, which must be able to pickle the callable.

    Parameters
    ----------
    cif : str
        Text of a crystallographic information file (CIF).

    Returns
    -------
    Structure
        The result of ``Structure.from_str(cif, fmt="cif")``.
    """
    return Structure.from_str(cif, fmt="cif")


def generate_elasticity_data(
    download_data=True,
    cif=False,
    train_e_above_hull=0.05,
    val_e_above_hull=0.05,
    theoretical=False,
    folder=join("mat_discover", "data", "elasticity"),
):
    """Download (or reload) elasticity data using MPRester and write CSV splits.

    Two result sets are obtained: entries that have elasticity data (the
    training set) and all entries matching the filters (the candidate pool).
    Entries of the pool whose ``pretty_formula`` also occurs in the training set
    are dropped, and the remainder becomes the validation set.

    The following files are written inside `folder`:

    - ``__init__.py`` (empty; any existing content is truncated)
    - ``elast_results.pkl`` and ``all_results.pkl`` (only if `download_data`)
    - ``elast_struct_dicts.pkl`` and ``val_struct_dicts.pkl`` (only if
      `download_data` and `cif`)
    - ``train.csv`` with columns ``formula`` and ``target`` (``target`` is the
      ``K_VRH`` entry of each elasticity record)
    - ``val.csv`` with columns ``formula`` and ``target`` (``target`` is all
      zeros)

    Parameters
    ----------
    download_data : bool, optional
        If True, query the Materials Project and pickle the raw results in
        `folder`. If False, load the previously pickled results from `folder`
        instead; the hull/theoretical filters are then not applied again.
        By default True.
    cif : bool, optional
        If True, also request the ``cif`` property and build `pymatgen`
        `Structure` objects from it. When reloading (``download_data=False``)
        the cached results must therefore contain a ``cif`` entry as well.
        By default False.
    train_e_above_hull : float, optional
        Exclusive upper bound (``$lt``) on ``e_above_hull`` for the elasticity
        (training) query, by default 0.05.
    val_e_above_hull : float, optional
        Exclusive upper bound (``$lt``) on ``e_above_hull`` for the query that
        the validation set is drawn from, by default 0.05.
    theoretical : bool, optional
        Whether a compound is theoretical or not. False means experimental
        compounds, API subject to change. Can take on values False, True, None,
        or a list (or tuple) of the previous, by default False. See
        https://matsci.org/t/how-to-use-has-icsd-exptl-id-property-in-pymatgen-query-function/2550/4
    folder : str, optional
        Which folder to save to, by default
        join("mat_discover", "data", "elasticity").

    Returns
    -------
    None
        All results are written to disk in `folder`.
    """
    # The query uses "$in", which needs a list. A tuple is accepted as well;
    # wrapping it as a single element would yield a query that matches nothing.
    if isinstance(theoretical, (list, tuple)):
        theoretical = list(theoretical)
    else:
        theoretical = [theoretical]

    elast_path = join(folder, "elast_results.pkl")
    all_path = join(folder, "all_results.pkl")
    elast_struct_path = join(folder, "elast_struct_dicts.pkl")
    val_struct_path = join(folder, "val_struct_dicts.pkl")

    Path(folder).mkdir(parents=True, exist_ok=True)

    # create Python "module" (for loading data); "w" mode leaves an empty file
    with open(join(folder, "__init__.py"), "w"):
        pass

    def my_path(name):
        """Return a relative path to a data file."""
        return join(".", folder, name)

    if download_data:
        # fmt: off
        excluded_elements = [
            "He", "Ne", "Ar", "Kr", "Xe", "Rn",
            "U", "Th", "Tc", "Po", "Pu", "Pa",
            ]
        # fmt: on
        elast_props = ["task_id", "pretty_formula", "elasticity"]
        all_props = ["task_id", "pretty_formula"]
        if cif:
            elast_props.append("cif")
            all_props.append("cif")

        elast_query = {
            "e_above_hull": {"$lt": train_e_above_hull},
            "elasticity": {"$exists": True},
            "theoretical": {"$in": theoretical},
            "elements": {"$nin": excluded_elements},
        }
        all_query = {
            "e_above_hull": {"$lt": val_e_above_hull},
            "theoretical": {"$in": theoretical},
            "elements": {"$nin": excluded_elements},
        }

        # download (both queries share one MPRester session)
        with MPRester() as m:
            elast_results = m.query(
                elast_query, properties=elast_props, chunk_size=2000
            )
            all_results = m.query(all_query, properties=all_props, chunk_size=2000)

        # save the raw results so later runs can use download_data=False
        with open(elast_path, "wb") as f:
            pickle.dump(elast_results, f)

        with open(all_path, "wb") as f:
            pickle.dump(all_results, f)
    else:
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were produced by a previous run of this function.
        with open(elast_path, "rb") as f:
            elast_results = pickle.load(f)

        with open(all_path, "rb") as f:
            all_results = pickle.load(f)

    # %% separate mpids and other properties for elasticity materials
    elast_mpids = [d["task_id"] for d in elast_results]
    elast_formulas = [d["pretty_formula"] for d in elast_results]
    K_VRH = [d["elasticity"]["K_VRH"] for d in elast_results]
    if cif:
        elast_cifs = [d["cif"] for d in elast_results]

    del elast_results

    elast_comp = [Composition(formula) for formula in elast_formulas]

    train_data = {
        "formula": elast_formulas,
        "composition": elast_comp,
        "K_VRH": K_VRH,
        "task_id": elast_mpids,
        "target": K_VRH,
    }
    if cif:
        if download_data:
            elast_structures = [structure_from_cif(text) for text in elast_cifs]
            # structures are cached as plain dicts
            with open(elast_struct_path, "wb") as f:
                pickle.dump([s.as_dict() for s in elast_structures], f)
        else:
            with open(elast_struct_path, "rb") as f:
                elast_struct_dicts = pickle.load(f)
            elast_structures = [Structure.from_dict(s) for s in elast_struct_dicts]
            del elast_struct_dicts
        train_data["structure"] = elast_structures

    elast_df = pd.DataFrame(data=train_data)
    elast_df.to_csv(my_path("train.csv"), columns=["formula", "target"], index=False)

    # %% validation set: drop every entry whose formula already occurs in the
    # training set (set membership keeps the original order of all_results)
    train_formulas = set(elast_formulas)
    val_results = [d for d in all_results if d["pretty_formula"] not in train_formulas]

    val_formulas = [d["pretty_formula"] for d in val_results]
    val_mpids = [d["task_id"] for d in val_results]
    if cif:
        val_cifs = [d["cif"] for d in val_results]

    del all_results, val_results

    val_comp = [Composition(formula) for formula in val_formulas]

    val_data = {
        "composition": val_comp,
        "formula": val_formulas,
        "task_id": val_mpids,
        "target": np.zeros(len(val_mpids)),
    }
    if cif:
        if download_data:
            # cpu_count() may return None when the count cannot be determined
            n_jobs = cpu_count() or 1
            val_structures = pqdm(val_cifs, structure_from_cif, n_jobs=n_jobs)
            with open(val_struct_path, "wb") as f:
                pickle.dump([s.as_dict() for s in val_structures], f)
        else:
            with open(val_struct_path, "rb") as f:
                val_struct_dicts = pickle.load(f)
            val_structures = [Structure.from_dict(s) for s in val_struct_dicts]
            del val_struct_dicts
        val_data["structure"] = val_structures

    val_df = pd.DataFrame(data=val_data)
    val_df.to_csv(my_path("val.csv"), columns=["formula", "target"], index=False)


# Script entry point: run the pipeline with its default arguments
# (download_data=True, cif=False) when this file is executed directly.
if __name__ == "__main__":
    generate_elasticity_data()