组织分类模型的构建
1、准备工作
#安装scikit-learn
pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple -U scikit-learn pandas matplotlib
#检查安装版本
python -c "import sklearn; sklearn.show_versions()"
2、模型构建
a、下载数据集(dataset.txt)和脚本(classification.py),并放在同一目录下。
b、打开终端(见下图),输入 python classification.py 运行脚本;也可以输入 python 进入交互界面,将代码粘贴进去执行。
#以下代码95%以上都为chatGPT生成
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import pickle
# 自定义函数
## PCA降维并可视化
def plot_PCA(X, y, title):
    """Project the samples onto two principal components and scatter-plot them.

    One scatter series is drawn per distinct value in ``y`` and the figure is
    displayed with ``plt.show()``.

    Args:
        X: Feature matrix passed to ``PCA.fit_transform``.
        y: Encoded class labels, one per row of ``X``. It is compared
            element-wise against each unique label to build a row mask, so it
            is expected to be a NumPy array.
        title: Title string for the figure.

    Returns:
        None.

    Note:
        Legend names are decoded through the module-level ``label_encoder``,
        so that object must already be fitted when this function is called.
    """
    pca = PCA(n_components=2)  # reduce the data to 2 dimensions
    X_pca = pca.fit_transform(X)

    # Decode every class id in a single call instead of once per iteration.
    class_ids = np.unique(y)
    class_names = label_encoder.inverse_transform(class_ids)

    plt.figure(figsize=(8, 6))
    for class_id, class_name in zip(class_ids, class_names):
        mask = y == class_id  # rows belonging to this class
        plt.scatter(X_pca[mask, 0], X_pca[mask, 1], label=class_name)
    plt.title(title)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend()
    plt.show()
## LDA降维并可视化
def plot_LDA(X, y, title):
    """Project the samples onto two LDA components and scatter-plot them.

    Unlike PCA, the projection is supervised: ``y`` is passed to
    ``fit_transform`` together with ``X``. One scatter series is drawn per
    distinct value in ``y`` and the figure is displayed with ``plt.show()``.

    Args:
        X: Feature matrix passed to ``LinearDiscriminantAnalysis.fit_transform``.
        y: Encoded class labels, one per row of ``X``. It is compared
            element-wise against each unique label to build a row mask, so it
            is expected to be a NumPy array.
        title: Title string for the figure.

    Returns:
        None.

    Note:
        Legend names are decoded through the module-level ``label_encoder``,
        so that object must already be fitted when this function is called.
        NOTE(review): per the scikit-learn documentation, ``n_components``
        cannot exceed ``min(n_classes - 1, n_features)``, so requesting two
        components needs at least three classes in ``y`` -- confirm for
        other datasets.
    """
    # Keep two discriminant components for the 2-D plot.
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_lda = lda.fit_transform(X, y)

    # Decode every class id in a single call instead of once per iteration.
    class_ids = np.unique(y)
    class_names = label_encoder.inverse_transform(class_ids)

    plt.figure(figsize=(8, 6))
    for class_id, class_name in zip(class_ids, class_names):
        mask = y == class_id  # rows belonging to this class
        plt.scatter(X_lda[mask, 0], X_lda[mask, 1], label=class_name)
    plt.title(title)
    plt.xlabel('LDA Component 1')
    plt.ylabel('LDA Component 2')
    plt.legend()
    plt.show()
# 1. Read the data: a tab-separated file whose first row holds the column names
data = pd.read_csv('dataset.txt', header=0, sep='\t')
# 2. Use the last column as the label and the columns in between as features.
#    The slice 1:-1 also skips the first column (presumably a sample
#    identifier -- confirm against dataset.txt).
X = data.iloc[:, 1:-1].values # features
y = data.iloc[:, -1].values # labels (last column)
# Convert the string labels to integer codes. LabelEncoder numbers the classes
# in sorted order, e.g. "blood" -> 0, "liver" -> 1, "muscle" -> 2,
# "small intestine" -> 3 (assuming these are the four labels in the dataset).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# classes_ lists the original label names; a name's position is its integer code.
print("Label mapping:", label_encoder.classes_)
# 3. Transform the features
X_log = np.log1p(X) # log(x + 1), a transform commonly used for transcriptome data
# Optional standardization step, currently disabled:
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X_log) # standardized features
# 4. Preliminary visualization of the log-transformed data
plot_PCA(X_log, y_encoded, 'PCA of the dataset')
plot_LDA(X_log, y_encoded, 'LDA of the dataset')
# 5. Split the data into training and test sets (20% held out, fixed seed)
# NOTE(review): the split is not stratified; with few samples per tissue a
# class could be missing from the test set -- consider stratify=y_encoded.
X_train, X_test, y_train, y_test = train_test_split(X_log, y_encoded, test_size=0.2, random_state=42)
# 6. Choose a classification model (a random forest is left here as an alternative)
# clf = RandomForestClassifier(random_state=42)
clf = LogisticRegression()
# 7. Train the model
clf.fit(X_train, y_train)
# 8. Predict on the test set
y_pred = clf.predict(X_test)
# 9. Evaluate the model's accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
# 10. Save the trained model to disk
with open('tissue_clf.pkl', 'wb') as f:
    pickle.dump(clf, f)
# 11. Load the model back from disk
# NOTE: unpickling executes arbitrary code, so only load pickle files you
# trust; this one was written by the lines just above.
with open('tissue_clf.pkl', 'rb') as f:
    clf2 = pickle.load(f)
# 12. Predict with the reloaded model and print its accuracy on the same test
#     set, as a check that the model survived the save/load round trip
y_pred_loaded = clf2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_loaded)
print(f'Accuracy: {accuracy * 100:.2f}%')
3、Python环境备选方案
a、在MobaXterm中安装python (未成功)
apt install python3
pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple -U scikit-learn pandas matplotlib
ssh fat01
cd /data2/dongkeai/group2/105042024802
mkdir ai_work
cd ai_work
cp /data2/dongkeai/00Data/AI/* ./
python classification.py
本站总访问量 次
Authors: