Source code for chemplot.chemplot

# Authors: Murat Cihan Sorkun <mcsorkun@gmail.com>, Dajt Mullaj <dajt.mullai@gmail.com>
#
# License: BSD 3 clause 
from __future__ import print_function

import chemplot.descriptors as desc
import chemplot.parameters as parameters
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import base64
import functools

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from pandas.api.types import is_numeric_dtype
from rdkit.Chem import Draw
from bokeh.plotting import figure
from bokeh.transform import transform, factor_cmap
from bokeh.palettes import Category10, Inferno, Spectral4
from bokeh.models.mappers import LinearColorMapper
from bokeh.models import ColorBar, HoverTool, Panel, Tabs
from bokeh.io import output_file, save, show
from scipy import stats
from io import BytesIO

def calltracker(func):
    @functools.wraps(func)
    def wrapper(*args):
        wrapper.has_been_called = True
        return func(*args)
    wrapper.has_been_called = False
    return wrapper

[docs]class Plotter(object): """ A class used to plot the ECFP fingerprints of the molecules used to instantiate it. :param __sim_type: similarity type structural or tailored :param __target_type: target type R (regression) or C (classificatino) :param __target: list containing the target values. Is empty if a target does not exist :param __mols: list of valid molecules that can be plotted :param __df_descriptors: datatframe containing the descriptors representation of each molecule :param __df_2_components: dataframe containing the two-dimenstional representation of each molecule :param __plot_title: title of the plot reflecting the dimensionality reduction algorithm used :param __data: list of the scaled descriptors to which the dimensionality reduction algorithm is applied :param pca_fit: PCA object created when the corresponding algorithm is applied to the data :param tsne_fit: t-SNE object created when the corresponding algorithm is applied to the data :param umap_fit: UMAP object created when the corresponding algorithm is applied to the data :param df_plot_xy: dataframe containing the coordinates that have been plotted :type __sim_type: string :type __target_type: string :type __target: list :type __mols: rdkit.Chem.rdchem.Mol :type __df_descriptors: Dataframe :type __df_2_components: Dataframe :type __plot_title: string :type __data: list :type pca_fit: sklearn.decomposition.TSNE :type tsne_fit: sklearn.manifold.TSNE :type umap_fit: umap.umap_.UMAP :type df_plot_xy: Dataframe """ _static_plots = {'scatter', 'hex', 'kde'} _interactive_plots = {'scatter', 'hex'} _sim_types = {'tailored', 'structural'} _target_types = {'R', 'C'} def __init__(self, encoding_list, target, target_type, sim_type, get_desc, get_fingerprints): # Error handeling sym_type if sim_type not in self._sim_types: if len(target) > 0: self.__sim_type = 'tailored' print('sim_type indicates the similarity type by which the plots are constructed.\n' + 'The supported similarity types are structural and tailored.\n' + 'Because a target list has been provided \'tailored\' as been selected as sym_type.') else: self.__sim_type = 'structural' print('sim_type indicates the similarity type by which the plots are constructed.\n' + 'The supported similarity types are structural and tailored.\n' + 'Because no target list has been provided \'structural\' as been selected as sym_type.') else: self.__sim_type = sim_type if self.__sim_type != "structural" and len(target) == 0: raise Exception("Target values missing") # Error handeling target_type if len(target) > 0: if len(target) != len(encoding_list): raise Exception("If target is provided its length must match the instances of molecules") if len(target) > 0: df_target = pd.DataFrame(data=target) unique_targets_ratio = 1.*df_target.iloc[:, 0].nunique()/df_target.iloc[:, 0].count() < 0.05 numeric_target = is_numeric_dtype(df_target.dtypes[0]) if target_type == 'R' and (unique_targets_ratio or not numeric_target): print('Input received is \'R\' for target values that seem not continuous.') if target_type not in self._target_types: if not unique_targets_ratio and numeric_target: self.__target_type = 'R' print('target_type indicates if the target is a continuous variable or a class label.\n'+ 'R stands for regression and C for classification. Input R as target type for continuous variables and C for class labels.\n'+ 'From analysis of the target, R has been selected for target_type.') else: self.__target_type = 'C' print('target_type indicates if the target is a continuous variable or a class label.\n'+ 'R stands for regression and C for classification. Input R as target type for continuous variables and C for class labels.\n'+ 'From analysis of the target, C has been selected for target_type.') else: self.__target_type = target_type else: self.__target_type = None if len(target) > 0 and self.__target_type == 'C': df_target = pd.DataFrame(data=target) if df_target.iloc[:, 0].nunique() == 1: target = [] self.__sim_type = "structural" print("Only one class found in the targets") # Instantiate Plotter class if self.__sim_type == "tailored": self.__mols, df_descriptors, target = get_desc(encoding_list, target) if df_descriptors.empty: raise Exception("Descriptors could not be computed for given molecules") self.__df_descriptors, self.__target = desc.select_descriptors_lasso(df_descriptors,target,kind=self.__target_type) elif self.__sim_type == "structural": self.__mols, self.__df_descriptors, self.__target = get_fingerprints(encoding_list,target,2,2048) if len(self.__mols) < 2 or len(self.__df_descriptors.columns) < 2: raise Exception("Plotter object cannot be instantiated for given molecules") self.__df_2_components = None self.__plot_title = None
[docs] @classmethod def from_smiles(cls, smiles_list, target=[], target_type=None, sim_type=None): """ Class method to construct a Plotter object from a list of SMILES. :param smile_list: List of the SMILES representation of the molecules to plot. :param target: target values :param target_type: target type R (regression) or C (classificatino) :param sim_type: similarity type structural or tailored :type smile_list: list :type target: list :type target_type: string :type sim_type: string :returns: A Plotter object for the molecules given as input. :rtype: Plotter """ return cls(smiles_list, target, target_type, sim_type, desc.get_mordred_descriptors, desc.get_ecfp)
[docs] @classmethod def from_inchi(cls, inchi_list, target=[], target_type=None, sim_type=None): """ Class method to construct a Plotter object from a list of InChi. :param inchi_list: List of the InChi representation of the molecules to plot. :type inchi_list: dict :param target: target values :type target: dict :param target_type: target type R (regression) or C (classificatino) :type target_type: string :param sim_type: similarity type structural or tailored :type sim_type: string :returns: A Plotter object for the molecules given as input. :rtype: Plotter """ return cls(inchi_list, target, target_type, sim_type, desc.get_mordred_descriptors_from_inchi, desc.get_ecfp_from_inchi)
[docs] def pca(self, **kwargs): """ Calculates the first 2 PCA components of the molecular descriptors. :param kwargs: Other keyword arguments are passed down to sklearn.decomposition.PCA :type kwargs: key, value mappings :returns: The dataframe containing the PCA components. :rtype: Dataframe """ self.__data = self.__data_scaler() # Linear dimensionality reduction to 2 components by PCA self.pca_fit = PCA(n_components=2, **kwargs) first2ecpf_components = self.pca_fit.fit_transform(self.__data) coverage_components = self.pca_fit.explained_variance_ratio_ # Create labels for the plot first_component = "PC-1 (" + "{:.0%}".format(coverage_components[0]) + ")" second_component = "PC-2 (" + "{:.0%}".format(coverage_components[1]) + ")" # Create a dataframe containinting the first 2 PCA components of ECFP self.__df_2_components = pd.DataFrame(data = first2ecpf_components , columns = [first_component, second_component]) self.__plot_title = "PCA plot" if len(self.__target) > 0: self.__df_2_components['target'] = self.__target return self.__df_2_components.copy()
[docs] def tsne(self, perplexity=None, pca=False, random_state=None, **kwargs): """ Calculates the first 2 t-SNE components of the molecular descriptors. :param perplexity: perplexity value for the t-SNE model :param pca: indicates if the features must be preprocessed by PCA :param random_state: random seed that can be passed as a parameter for reproducing the same results :param kwargs: Other keyword arguments are passed down to sklearn.manifold.TSNE :type perplexity: int :type pca: boolean :type random_state: int :type kwargs: key, value mappings :returns: The dataframe containing the t-SNE components. :rtype: Dataframe """ self.__data = self.__data_scaler() self.__plot_title = "t-SNE plot" # Preprocess the data with PCA if pca and self.__sim_type == "structural": pca = PCA(n_components=10, random_state=random_state) self.__data = pca.fit_transform(self.__data) self.__plot_title = "t-SNE plot from components with cumulative variance explained " + "{:.0%}".format(sum(pca.explained_variance_ratio_)) else: self.__plot_title = "t-SNE plot" # Get the perplexity of the model if perplexity is None: if self.__sim_type == "structural": if pca: perplexity = parameters.perplexity_structural_pca(len(self.__data)) else: perplexity = parameters.perplexity_structural(len(self.__data)) else: perplexity = parameters.perplexity_tailored(len(self.__data)) else: if perplexity<5 or perplexity>50: print('Robust results are obtained for values of perplexity between 5 and 50') # Embed the data in two dimensions self.tsne_fit = TSNE(n_components=2, perplexity=perplexity, random_state=random_state, **kwargs) ecfp_tsne_embedding = self.tsne_fit.fit_transform(self.__data) # Create a dataframe containinting the first 2 TSNE components of ECFP self.__df_2_components = pd.DataFrame(data = ecfp_tsne_embedding , columns = ['t-SNE-1', 't-SNE-2']) if len(self.__target) > 0: self.__df_2_components['target'] = self.__target return self.__df_2_components.copy()
[docs] def umap(self, n_neighbors=None, min_dist=None, pca=False, random_state=None, **kwargs): """ Calculates the first 2 UMAP components of the molecular descriptors. :param num_neighbors: Number of neighbours used in the UMAP madel. :param min_dist: Value between 0.0 and 0.99, indicates how close to each other the points can be displayed. :param random_state: random seed that can be passed as a parameter for reproducing the same results :param kwargs: Other keyword arguments are passed down to umap.UMAP :type num_neighbors: int :type min_dist: float :type random_state: int :type kwargs: key, value mappings :returns: The dataframe containing the UMAP components. :rtype: Dataframe """ self.__data = self.__data_scaler() # Preprocess the data with PCA if pca and self.__sim_type == "structural": pca = PCA(n_components=10, random_state=random_state) self.__data = pca.fit_transform(self.__data) self.__plot_title = "UMAP plot from components with cumulative variance explained " + "{:.0%}".format(sum(pca.explained_variance_ratio_)) else: self.__plot_title = "UMAP plot" if n_neighbors is None: if self.__sim_type == "structural": if pca: n_neighbors = parameters.n_neighbors_structural_pca(len(self.__data)) else: n_neighbors = parameters.n_neighbors_structural(len(self.__data)) else: n_neighbors = parameters.n_neighbors_tailored(len(self.__data)) if min_dist is None or min_dist < 0.0 or min_dist > 0.99: if min_dist is not None and (min_dist < 0.0 or min_dist > 0.99): print('min_dist must range from 0.0 up to 0.99. Default used.') if self.__sim_type == "structural": if pca: min_dist = parameters.MIN_DIST_STRUCTURAL_PCA else: min_dist = parameters.MIN_DIST_STRUCTURAL else: min_dist = parameters.MIN_DIST_TAILORED # Embed the data in two dimensions self.umap_fit = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, random_state=random_state, n_components=2, **kwargs) ecfp_umap_embedding = self.umap_fit.fit_transform(self.__data) # Create a dataframe containinting the first 2 UMAP components of ECFP self.__df_2_components = pd.DataFrame(data = ecfp_umap_embedding , columns = ['UMAP-1', 'UMAP-2']) if len(self.__target) > 0: self.__df_2_components['target'] = self.__target return self.__df_2_components.copy()
[docs] def cluster(self, n_clusters=5, **kwargs): """ Computes the clusters presents in the embedded chemical space. :param n_clusters: Number of clusters that will be computed :param kwargs: Other keyword arguments are passed down to sklearn.cluster.KMeans :type n_clusters: int :type kwargs: key, value mappings :returns: The dataframe containing the 2D embedding. :rtype: Dataframe """ if self.__df_2_components is None: print('Reduce the dimensions of your molecules before clustering.') return None x = self.__df_2_components.columns[0] y = self.__df_2_components.columns[1] cluster = KMeans(n_clusters, **kwargs) cluster.fit(self.__df_2_components[[x,y]]) self.__df_2_components['clusters'] = cluster.labels_.tolist() return self.__df_2_components.copy()
[docs] def visualize_plot(self, size=20, kind="scatter", remove_outliers=False, is_colored=True, colorbar=False, clusters=False, filename=None, title=None): """ Generates a plot for the given molecules embedded in two dimensions. :param size: Size of the plot :param kind: Type of plot :param remove_outliers: Boolean value indicating if the outliers must be identified and removed :param is_colored: Indicates if the points must be colored according to target :param colorbar: Indicates if the plot legend must be represented as a colorbar. Only considered when the target_type is "R". :param clusters: If True the clusters are shown instead of possible targets. Pass a list or a int to only show selected clusters (indexed by int). :param filename: Indicates the file where to save the plot :param title: Title of the plot. :type size: int :type kind: string :type remove_outliers: boolean :type is_colored: boolean :type colorbar: boolean :type clusters: boolean or list or int :type filename: string :type title: string :returns: The matplotlib axes containing the plot. :rtype: Axes """ if self.__df_2_components is None: print('Reduce the dimensions of your molecules before creating a plot.') return None if clusters is not False and 'clusters' not in self.__df_2_components: print('Call cluster() before visualizing a plot with clusters.') if title is None: title = self.__plot_title if kind not in self._static_plots: kind = 'scatter' print('kind indicates which type of plot must be visualized. Currently supported static visualization are:\n'+ '-scatter plot (scatter)\n'+ '-hexagon plot (hex)\n'+ '-kernel density estimation plot (kde)\n'+ 'Please input one between scatter, hex or kde for parameter kind.\n'+ 'As default scatter has been taken.') x, y, df_data = self.__parse_dataframe() # Define colors hue = None hue_order = None palette = None if clusters is not False and 'clusters' in self.__df_2_components.columns: hue = 'clusters' palette = 'deep' if not isinstance(clusters, bool): if isinstance(clusters, int): clusters = [clusters] df_data['clusters'] = df_data['clusters'].isin(clusters) # Labels cluster total = df_data['clusters'].value_counts() t_s = total.get(True) if total.get(True) else 0 p_s = t_s / total.sum() p_o = 1 - p_s labels = { True: f'Selected - {p_s:.0%}', False: f'Other - {p_o:.0%}' } df_data.clusters.replace(labels, inplace=True) hue_order = list(labels.values()) else: hue_order = self.__percentage_clusters(df_data) hue_order.sort() else: if len(self.__target) == 0: is_colored = False; else: if is_colored: df_data = df_data.assign(target=self.__target) hue = 'target' palette = 'deep' if self.__target_type == "R": palette = sns.color_palette("inferno", as_cmap=True) # Remove outliers (using Z-score) if remove_outliers: df_data = self.__remove_outliers(x, y, df_data) # Define plot aesthetics parameters sns.set_style("dark") sns.set_context("notebook", font_scale=size*0.15) fig, ax = plt.subplots(figsize=(size,size)) # Create a plot based on the reduced components if kind == "scatter": plot = sns.scatterplot(x=x, y=y, hue=hue, hue_order=hue_order, palette=palette, data=df_data, s=size*3) plot.set_label("scatter") axis = plot plot.legend(markerscale=size*0.19) # Add colorbar if self.__target_type == "R" and colorbar: plot.get_legend().remove() norm = plt.Normalize(df_data['target'].min(), df_data['target'].max()) cm = plt.cm.ScalarMappable(cmap="inferno", norm=norm) cm.set_array([]) plot.figure.colorbar(cm) elif kind == "hex": plot = ax.hexbin(df_data[x], df_data[y], gridsize=40, cmap='Blues') fig.colorbar(plot, ax=ax) ax.set_label("hex") axis = ax elif kind == "kde": plot = sns.kdeplot(x=x, y=y, shade=True, data=df_data) plot.set_label("kde") axis = plot # Remove units from axis axis.set(yticks=[]) axis.set(xticks=[]) # Add labels axis.set_title(title,fontsize=size*2) axis.set_xlabel(x,fontsize=size*2) axis.set_ylabel(y,fontsize=size*2) # Save plot if filename is not None: fig.savefig(filename) self.df_plot_xy = df_data[[x,y]] return axis
[docs] def interactive_plot(self, size=700, kind="scatter", remove_outliers=False, is_colored=True, clusters=False, filename=None, show_plot=False, title=None,): """ Generates an interactive Bokeh plot for the given molecules embedded in two dimensions. :param size: Size of the plot :param kind: Type of plot :param remove_outliers: Boolean value indicating if the outliers must be identified and removed :param is_colored: Indicates if the points must be colored according to target :param clusters: Indicates if to add a tab with the clusters if these have been computed :param filename: Indicates the file where to save the Bokeh plot :param show_plot: Immediately display the current plot. :param title: Title of the plot. :type size: int :type kind: string :type remove_outliers: boolean :type is_colored: boolean :type cluster: boolean :type filename: string :type show_plot: boolean :type title: string :returns: The bokeh figure containing the plot. :rtype: Figure """ if self.__df_2_components is None: print('Reduce the dimensions of your molecules before creating a plot.') return None if clusters and 'clusters' not in self.__df_2_components: print('Call cluster() before visualizing a plot with clusters.') if title is None: title = self.__plot_title if kind not in self._interactive_plots: kind = 'scatter' print('kind indicates which type of plot must be visualized. Currently supported interactive visualization are:\n'+ '-scatter plot (scatter)\n'+ '-hexagon plot (hex)\n'+ 'Please input one between scatter, hex or kde for parameter kind.\n'+ 'As default scatter has been taken.') x, y, df_data = self.__parse_dataframe() df_data['mols'] = self.__mols if len(self.__target) > 0: # Target exists if self.__target_type == 'C': df_data['target'] = list(map(str, self.__target)) else: df_data['target'] = self.__target # Remove outliers (using Z-score) if remove_outliers: df_data = self.__remove_outliers(x, y, df_data) tabs = None if kind == "scatter": p, tabs = self.__interactive_scatter(x, y, df_data, size, is_colored, clusters, title) else: p = self.__interactive_hex(x, y, df_data, size, title) p.xaxis[0].axis_label = x p.yaxis[0].axis_label = y p.xaxis.major_tick_line_color = None p.xaxis.minor_tick_line_color = None p.yaxis.major_tick_line_color = None p.yaxis.minor_tick_line_color = None p.xaxis.major_label_text_font_size = '0pt' p.yaxis.major_label_text_font_size = '0pt' if tabs is not None: p = tabs # Save plot if filename is not None: output_file(filename, title=title) save(p) # Show plot if show_plot: self.__open_plot(p) self.df_plot_xy = df_data[[x,y]] return p
def __data_scaler(self): # Scale the data if self.__sim_type != "structural": scaled_data = StandardScaler().fit_transform(self.__df_descriptors.values.tolist()) else: scaled_data = self.__df_descriptors.values.tolist() return scaled_data def __parse_dataframe(self): x = self.__df_2_components.columns[0] y = self.__df_2_components.columns[1] return x, y, self.__df_2_components.copy() def __remove_outliers(self, x, y, df): # Remove outliers (using Z-score) z_scores = stats.zscore(df[[x,y]]) abs_z_scores = np.abs(z_scores) filtered_entries = (abs_z_scores < 3).all(axis=1) return df[filtered_entries] def __percentage_clusters(self, df_data): total = df_data['clusters'].value_counts() sum_tot = total.sum() labels = {} count = 0 for key, value in total.items(): p = float(f"{(value/sum_tot)*100:.0f}") labels[key] = p count += p # Solve possible rounding errors if 100 - count > 0: labels[0] = labels[0] + 100 - count for key, value in labels.items(): labels[key] = f"Cluster {key} - {value:.0f}%" # Edit df_data and return labels df_data.clusters.replace(labels, inplace=True) return list(labels.values()) def __interactive_scatter(self, x, y, df_data, size, is_colored, clusters, title): # Add images column df_data['imgs'] = self.__mol_to_2Dimage(list(df_data['mols'])) df_data.drop(columns=['mols'], inplace=True) # Set tools tools = "pan, lasso_select, wheel_zoom, hover, save, reset" if len(self.__target) == 0: TOOLTIPS = parameters.TOOLTIPS_NO_TARGET else: TOOLTIPS = parameters.TOOLTIPS_TARGET # Create plot p = figure(title=title, plot_width=size, plot_height=size, tools=tools, tooltips=TOOLTIPS) if len(self.__target) == 0 or not(is_colored): p.circle(x=x, y=y, size=2.5, alpha=0.8, source=df_data) else: # Target exists if self.__target_type == 'C': index_cmap = factor_cmap('target', Category10[10], list(set(df_data['target']))) p.circle(x=x, y=y, size=2.5, alpha=0.8, line_color=index_cmap, fill_color=index_cmap, legend_group="target", source=df_data) p.legend.location = "top_left" p.legend.title = "Target" else: color_mapper = LinearColorMapper(Inferno[256], low=min(df_data['target']), high=max(df_data['target'])) index_cmap = transform('target', color_mapper) p.circle(x=x, y=y, size=2.5, alpha=0.8, line_color=index_cmap, fill_color=index_cmap, source=df_data) color_bar = ColorBar(color_mapper=color_mapper, location=(0,0)) p.add_layout(color_bar, 'right') tabs = None if clusters and 'clusters' in df_data.columns: p_c = figure(title=title, plot_width=size, plot_height=size, tools=tools, tooltips=parameters.TOOLTIPS_CLUSTER) # Get percentages self.__percentage_clusters(df_data) clusters = df_data.groupby(['clusters']) for cluster, color in zip(clusters, Category10[10]): p_c.circle(x=x, y=y, size=2.5, alpha=1, line_color=color, fill_color=color, legend_label=f'{cluster[0]}', muted_color=('#717375'), muted_alpha=0.2, source=cluster[1]) p_c.legend.location = "top_left" p_c.legend.title = "Clusters" p_c.legend.click_policy = "mute" p_c.xaxis[0].axis_label = x p_c.yaxis[0].axis_label = y p_c.xaxis.major_tick_line_color = None p_c.xaxis.minor_tick_line_color = None p_c.yaxis.major_tick_line_color = None p_c.yaxis.minor_tick_line_color = None p_c.xaxis.major_label_text_font_size = '0pt' p_c.yaxis.major_label_text_font_size = '0pt' tab1 = Panel(child=p, title="Plot") tab2 = Panel(child=p_c, title="Clusters") tabs = Tabs(tabs=[tab1, tab2]) return p, tabs def __interactive_hex(self, x, y, df_data, size, title): # Hex Plot df_data.drop(columns=['mols'], inplace=True) tools = "pan, wheel_zoom, save, reset" p = figure(title=title, plot_width=size, plot_height=size, match_aspect=True, tools=tools) p.background_fill_color = '#440154' p.grid.visible = False max_x = max(df_data[x]) min_x = min(df_data[x]) max_y = max(df_data[y]) min_y = min(df_data[y]) diff_x = max_x - min_x diff_y = max_y - min_y size = max(diff_y, diff_x) / 20 p.hexbin(df_data[x], df_data[y], size=size, hover_color="pink", hover_alpha=0.8) hover = HoverTool(tooltips=[("count", "@c")]) p.add_tools(hover) return p def __mol_to_2Dimage(self, list_mols): # Create molecule images images_mol=[] for mol in list_mols: try: png = Draw.MolToImage(mol,size=(200,130)) out = BytesIO() png.save(out, format='jpeg') png = out.getvalue() url = 'data:image/jpeg;base64,' + base64.b64encode(png).decode('utf-8') except: url = None images_mol.append(url) return images_mol @calltracker def __open_plot(self, p): show(p) def get_target(self): return self.__target