Overview
最近几天,Chris
和我看了很多论文,对PSSM
有了更深的认识。但是,鉴于PSSM
本身包含单个位置的信息更明显,而几乎没有包含蛋白质序列片段信息,我们两人思考如何将蛋白质序列片段信息编码,终于找到了一种PSSM
的处理方式,这种方式叫做smoothed window
,特此记录一下。
该算法原理,请参考这篇论文:Predicting RNA-binding sites of proteins using support vector machines and evolutionary information,并在此感谢该论文作者!并感谢Chris
对我的鼓励和帮助!
1 python编码
1.1 t34pssm.py
这部分代码是主函数,是之前这篇文章中的代码,可以参考这里蛋白质序列特征提取方法之——PSSM,就不详加解释了。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | #! /usr/bin/env python # -*- coding: utf-8 -*- # vim:fenc=utf-8 """ python t34pssm.py total_train_60.fasta ./total_train_60 ./total_train_60_pssm w_smth n param: 1. 总的fasta格式蛋白质序列 2. 分开的fasta格式蛋白质序列的文件夹 3. 分开的fasta格式蛋白质序列对应的pssm的文件夹 4.smooth-window 值,要求是奇数。 5.截取的序列长度,一般为25,30,50。本例为30. """ import fileinput import sys from os import listdir from os.path import isfile, join import re from pssm_smoothed import * smplfasta = sys.argv[ 1 ] spfasta = sys.argv[ 2 ] check_head = re. compile (r '\>' ) #read from undersample fasta, store smplist = [] smpcnt = 0 for line, strin in enumerate (fileinput. input (smplfasta)): if check_head.match(strin): smplist.append(strin.strip()) smpcnt + = 1 onlyfiles = [ f for f in listdir(spfasta) if isfile(join(spfasta,f)) ] fastaDict = {} for fi in onlyfiles: cntnt = '' for line, strin in enumerate (fileinput. input (spfasta + '/' + fi)): if line = = 0 : cntnt + = strin.strip() if cntnt in fastaDict: print strin fastaDict[cntnt] = fi pssmdir = sys.argv[ 3 ] w_smth = sys.argv[ 4 ] #如果窗口值不是奇数,退出程序 if int (w_smth) % 2 = = 0 : print 'Please change your input argument ' + w_smth + ' to an odd smoothing-window number!!!' sys.exit( 1 ) n = sys.argv[ 5 ] for smp in smplist: finalist.append(pssmdir + '/' + fastaDict[smp].split( '.' )[ 0 ] + '.pssm' ) for fi in finalist: pssm_single(fi, 'total_train_60_pssm_smth' ,w_smth,n) |
1.2 pssm_smoothed.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | #! /usr/bin/env python # -*- coding: utf-8 -*- # vim:fenc=utf-8 """ Retrieve smoothed PSSM features """ import sys import numpy as np import math import re import fileinput def pssm_single(fi,output_smth,w_smth,n): # 0-19 represents amino acid 'ARNDCQEGHILKMFPSTWYV' w_smth = int (w_smth) n = int (n) Amino_vec = "ARNDCQEGHILKMFPSTWYV" PSSM = [] PSSM_orig = [] seq_cn = 0 # 读取pssm文件 for line, strin in enumerate (fileinput. input (fi)): if line > 2 : str_vec = strin.split()[ 1 : 22 ] if len (str_vec) = = 0 : break PSSM.append( map ( int , str_vec[ 1 :])) seq_cn + = 1 print seq_cn fileinput.close() PSSM_smth = np.array([[ 0.0 ] * 20 ] * seq_cn) #print PSSM_smth PSSM_orig = np.array(PSSM) #print PSSM_orig #section for PSSM_smth features PSSM_smth_full = pssm_smth(PSSM_orig,PSSM_smth,w_smth,seq_cn) PSSM_smth_final = [[ 0.0 ] * 20 ] * n #截取PSSM_smth_full矩阵的前n行,作为输出内容 for i in range (n): PSSM_smth_final[i] = PSSM_smth_full[i] #print PSSM_smth_final[i] PSSM_smth_final_shp = np.shape(PSSM_smth_final) # for i in range(seq_cn): # print PSSM_smth_final[i] file_out_smth = file (output_smth, 'a' ) np.savetxt(file_out_smth, [np.reshape(PSSM_smth_final, (PSSM_smth_final_shp[ 0 ] * PSSM_smth_final_shp[ 1 ], ))], delimiter = "," ) #这个函数会求出整条序列的smoothed pssm矩阵 def pssm_smth(PSSM_orig,PSSM_smth,w_smth,l): for i in range (l): #smooth窗口超过pssm上边界 if i <(w_smth - 1 ) / 2 : for j in range (i + (w_smth - 1 ) / 2 + 1 ): #print i,j PSSM_smth[i] + = PSSM_orig[j] #print PSSM_smth[i] #smooth窗口超过pssm下边界 elif i> = (l - (w_smth - 1 ) / 2 ): for j in range (i - (w_smth - 1 ) / 2 ,l): #print i,j PSSM_smth[i] + = PSSM_orig[j] #print PSSM_smth[i] else : for j in range (i - (w_smth - 1 ) / 2 ,i + (w_smth - 1 ) / 2 + 1 ): #print i,j PSSM_smth[i] + = PSSM_orig[j] #print PSSM_smth[i] return PSSM_smth |
1.3 总结
该算法以行为单位进行运算。以本行为中心,上下扩展,扩展的上下长度为smooth
窗口值,将这些行的值加起来,存入新的矩阵的相同行位置。这样新的矩阵就包含了多个连续氨基酸序列片段的信息,会为特征提取提供新的思路。