import numpy as np
import pandas as pd

# Download the iris table (CSV hosted in the pandas test-data folder) into a
# DataFrame; the classification example below is built on it.
url = 'https://github.com/pandas-dev/pandas/blob/main/pandas/tests/io/data/csv/iris.csv?raw=true'
df = pd.read_csv(filepath_or_buffer=url)

#print(df)

from sklearn.model_selection import train_test_split

# X (input features): the first four columns of the table.
x = df.iloc[:, 0:4]
# Y (output targets): the last column, selected with a scalar index so the
# result is a 1-D Series. A `-1:` slice would give a one-column DataFrame,
# which scikit-learn classifiers flag with a DataConversionWarning at fit
# time because they expect a target of shape (n_samples,).
y = df.iloc[:, -1]

# Split the dataset into a train set and a test set: 30% of the rows are held
# out for testing, and the fixed seed keeps the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=32
)

from sklearn.ensemble import RandomForestClassifier

# Define the Random Forest Classifier with 100 trees. The seed is fixed (same
# value as the train/test split) so that the fitted forest, and therefore
# every metric reported from it, is reproducible from run to run.
rfc = RandomForestClassifier(n_estimators=100, random_state=32)

# train the model with train set
rfc.fit(x_train, y_train)

# make prediction on the test set
y_pred = rfc.predict(x_test)

from sklearn import metrics
from sklearn.metrics import classification_report

# Classification Model Evaluation

# Accuracy of the test-set predictions against the true labels
accuracy = metrics.accuracy_score(y_test, y_pred)
print("The accuracy score is:", accuracy)

# Text report with precision, recall and F1-score
print("\nThe classification report is: \n")
report = classification_report(y_test, y_pred)
print(report)

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 29.4/29.4 MB 48.3 MB/s eta 0:00:00

# Fetch the hERG table and keep only its first 1000 rows for the regression
# example that follows.
data_a = pd.read_csv("https://dp-public.oss-cn-beijing.aliyuncs.com/community/hERG.csv")
data = data_a.head(1000)
print(data)

# Install necessary packages
!pip install -q rdkit-pypi
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# convert the 'SMILES' column into a list
smiles_list = data['SMILES'].tolist()

# Convert every SMILES string into a 2048-bit Morgan fingerprint (radius 2).
# MolFromSmiles returns None for a string it cannot parse; such rows are
# skipped, so the position of every row that IS kept is recorded. Without
# that bookkeeping the feature table would end up shorter than (and shifted
# against) the target column, and train_test_split would fail.
morgan_fps = []
kept_positions = []
for position, smiles in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        continue
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    arr = np.zeros((1,))  # NumPy array that receives the fingerprint bits
    DataStructs.ConvertToNumpyArray(fp, arr)  # convert the computed fp to array
    morgan_fps.append(arr)
    kept_positions.append(position)

# get the X and Y
X = pd.DataFrame(morgan_fps)  # use the fingerprints as the input features
# Take the targets of exactly the rows that produced a fingerprint, and
# renumber them so X and Y share the same 0..n-1 row index.
Y = data['pIC50'].iloc[kept_positions].reset_index(drop=True)

# split the dataset into train (80%) and test (20%) with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=32
)

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 29.4/29.4 MB 48.3 MB/s eta 0:00:00

from sklearn.ensemble import RandomForestRegressor

# Create the Random Forest Regressor (100 trees, fixed seed) and train it on
# the training split in one expression; `fit` hands back the estimator
# itself, so `rfr` is the trained model.
rfr = RandomForestRegressor(n_estimators=100, random_state=32).fit(X_train, Y_train)

# predict the targets of the held-out test split
Y_pred = rfr.predict(X_test)

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error

# Regression Model Evaluation: compare the test-set predictions with the
# true targets using four standard metrics.
MAE = mean_absolute_error(Y_test, Y_pred)  # mean absolute error
MSE = mean_squared_error(Y_test, Y_pred)  # mean squared error
RMSE = root_mean_squared_error(Y_test, Y_pred)  # root mean squared error
R2 = r2_score(Y_test, Y_pred)  # coefficient of determination (R Square)

# Print each metric next to its label.
for label, score in (("MAE = ", MAE), ("MSE = ", MSE), ("RMSE = ", RMSE), ("R2 = ", R2)):
    print(label, score)

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
)

# CCR (Accuracy) of the classifier's test-set predictions
ccr = accuracy_score(y_true=y_test, y_pred=y_pred)

# Precision and Recall.
# Change 'average' according to whether the task is binary or multi-class.
precision = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
recall = recall_score(y_true=y_test, y_pred=y_pred, average='macro')

# Classification report, which includes the F1 score alongside the other
# classification metrics
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)

# Each regression metric is computed from the regressor's test-set predictions.
MAE = mean_absolute_error(y_true=Y_test, y_pred=Y_pred)  # MAE
MSE = mean_squared_error(y_true=Y_test, y_pred=Y_pred)  # MSE
RMSE = root_mean_squared_error(y_true=Y_test, y_pred=Y_pred)  # RMSE
R2 = r2_score(y_true=Y_test, y_pred=Y_pred)  # R Square

import pickle

# Save the model: serialize the random forest classifier trained above
# into 'new_model.pkl'.
with open('new_model.pkl', mode='wb') as model_file:
    pickle.dump(rfc, model_file)

# Load the model back from the same file.
# NOTE: only unpickle files you created yourself or otherwise trust.
with open('new_model.pkl', mode='rb') as model_file:
    loaded_model = pickle.load(model_file)

# The restored model is used to predict on the test set.
y_pred_new = loaded_model.predict(x_test)

import tensorflow as tf
from tensorflow import keras

# Prepare the dataset: load MNIST through Keras.
(train_x, train_y), (test_x, test_y) = tf.keras.datasets.mnist.load_data()

# Keep only the first 1000 samples (and their labels) of each split.
train_x, train_y = train_x[:1000], train_y[:1000]
test_x, test_y = test_x[:1000], test_y[:1000]

# Flatten every sample to a vector of 28 * 28 values and divide by 255.0.
train_x = train_x.reshape(-1, 28 * 28) / 255.0
test_x = test_x.reshape(-1, 28 * 28) / 255.0

# build and train a simple sequential model

def create_model(input_dim=784, hidden_units=512, dropout_rate=0.2, num_classes=10):
    """Build and compile a small fully connected classifier.

    The network is: input -> Dense(hidden_units, relu) -> Dropout -> Dense
    (num_classes). The last layer has no activation, so the model outputs raw
    logits; the loss is configured with ``from_logits=True`` accordingly.

    Args:
        input_dim: Length of each flattened input vector (default 784).
        hidden_units: Width of the hidden dense layer (default 512).
        dropout_rate: Dropout rate applied after the hidden layer (default 0.2).
        num_classes: Number of output logits / classes (default 10).

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and sparse categorical accuracy.
    """
    model = tf.keras.Sequential([
        # Declare the input shape with an explicit Input object; passing
        # `input_shape=` to the first Dense layer is discouraged in current
        # Keras and triggers a warning there.
        keras.Input(shape=(input_dim,)),
        keras.layers.Dense(hidden_units, activation='relu'),
        keras.layers.Dropout(dropout_rate),
        keras.layers.Dense(num_classes),
    ])

    model.compile(
        optimizer='adam',
        # Labels are integer class ids and the model emits logits.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    return model

# Build the network and train it for 5 epochs on the prepared training data.
model = create_model()
model.fit(x=train_x, y=train_y, epochs=5)

# Save the model as a `.keras` zip archive.
model.save('new_model.keras')

# Load the model back from that archive.
new_model = keras.models.load_model('new_model.keras')

# Check the loaded model by printing its layer summary.
new_model.summary()

1 Types of ML System¶

2 Training QSAR Models with Python¶

2.1 Random Forest Classification Model¶

2.2 Random Forest Regression Model¶

3 Model Evaluation Metrics¶

3.1 Classification Metrics¶

3.1.1 Confusion Matrix¶

3.1.2 Accuracy (CCR)¶

3.1.3 Precision (PPV)¶

3.1.4 Recall (Sensitivity)¶

3.1.5 F1 Score¶

3.2 Regression Metrics¶

3.2.1 Mean Absolute Error (MAE)¶

3.2.2 Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)¶

3.2.3 R Square¶

4 Save and Load the Trained Models¶

4.1 Scikit-Learn Models¶

4.2 Tensorflow Keras Models¶