Overview
昨天跟Chris
讨论SVM
分类预测准确性的时候,知道PSSM_AC
的作用比PSSM
作用更明显,于是决定将以前的python
脚本改进一下,输出PSSM
和PSSM_AC
这两个文件,方便观察。该脚本包括两部分,本文将按顺序记录下来。
以前的脚本可以参考我之前的文章蛋白质序列特征提取方法之——PSSM。
1. t34pssm.py
#! /usr/bin/env python
import fileinput
import sys
from os import listdir
from os.path import isfile, join
import re
from pssm_ft import *
smplfasta = sys.argv[1]
spfasta = sys.argv[2]
check_head = re.compile(r'\>')
#read from undersample fasta, store
smplist = []
smpcnt = 0
for line, strin in enumerate(fileinput.input(smplfasta)):
if check_head.match(strin):
smplist.append(strin.strip())
smpcnt += 1
onlyfiles = [ f for f in listdir(spfasta) if isfile(join(spfasta,f)) ]
fastaDict = {}
for fi in onlyfiles:
cntnt = ''
for line, strin in enumerate(fileinput.input(spfasta+'/'+fi)):
if line == 0:
cntnt += strin.strip()
if cntnt in fastaDict:
print strin
fastaDict[cntnt] = fi
pssmdir = sys.argv[3]
finalist = []
for smp in smplist:
finalist.append(pssmdir+'/'+fastaDict[smp].split('.')[0]+'.pssm')
for fi in finalist:
pssm(fi, 'total_train_60.pssm','total_train_60.pssm_AC') #此处多了一个参数,多了一个输出文件
2. pssm_ft.py
import sys
import numpy as np
import math
import re
import fileinput
GROUP = 10
def scale(x, max, min):
"""
scale x to [-1,1]
"""
try:
return -(1-(x-min)/(max-min))+((x-min)/(max-min))
except Exception:
print "ZeroDivisionError, max-min:\s", max-min
def pssm_ac_cal(PSSM, g, l):
"""
:param PSSM: ['1','-1'...]
:param g: group
:param l: sequence length
:return PSSM_AC: g * 20 matrix
"""
PSSM_AC = np.array([ [0.0] * 20 ] * g)
for pg in range(g):
l_g = l - pg - 1
for pj in range(20):
sum_jl = 0.0
for i in range(l):
sum_jl += PSSM[i][pj]
sum_jl /= l
pssm_acjg = 0.0
for i in range(l_g):
pssm_acjg += (PSSM[i][pj]-sum_jl) * (PSSM[i+pg+1][pj]-sum_jl)
pssm_acjg /= l_g
PSSM_AC[pg][pj] = pssm_acjg
return PSSM_AC
def pssm(fi,output,output_AC):
# 0-19 represents amino acid 'ARNDCQEGHILKMFPSTWYV'
Amino_vec = "ARNDCQEGHILKMFPSTWYV"
PSSM = []
ACC = [ [0.0] * 20 ] * 20
seq_cn = 0
for line, strin in enumerate(fileinput.input(fi)):
if line > 2:
str_vec = strin.split()[1:22]
if len(str_vec) == 0:
break
PSSM.append(map(int, str_vec[1:]))
seq_cn += 1
#test1_1 += float(str_vec[1]) test1_1 = -103
ACC[Amino_vec.index(str_vec[0])] = map(sum, zip(map(int, str_vec[1:]),
ACC[Amino_vec.index(str_vec[0])]))
fileinput.close()
# section for ACC features
ACC_np = np.array(ACC)
ACC_np = np.divide(ACC_np, seq_cn)
amcnt_max = np.amax(ACC_np)
amcnt_min = np.amin(ACC_np)
vfunc = np.vectorize(scale)
ACC_np = vfunc(ACC_np, amcnt_max,amcnt_min)
ACC_shp = np.shape(ACC_np)
#section for PSSM_AC features
PSSM = np.array(PSSM)
PSSM_AC = pssm_ac_cal(PSSM, GROUP, seq_cn)
PSSM_shp = np.shape(PSSM_AC)
file_out = file(output,'a')
file_out_AC = file(output_AC,'a')
#此处将保存文件的代码拆分开
np.savetxt(file_out, [np.reshape(ACC_np, (ACC_shp[0] * ACC_shp[1], ))], delimiter=",")
np.savetxt(file_out_AC, [np.reshape(PSSM_AC, (PSSM_shp[0] * PSSM_shp[1], ))], delimiter=",")