Overview

最近几天,Chris和我看了很多论文,对PSSM有了更深的认识。但是,鉴于PSSM本身包含单个位置的信息更明显,而几乎没有包含蛋白质序列片段信息,我们两人思考如何将蛋白质序列片段信息编码,终于找到了一种PSSM的处理方式,这种方式叫做smoothed window,特此记录一下。
该算法原理,请参考这篇论文:Predicting RNA-binding sites of proteins using support vector machines and evolutionary information,并在此感谢该论文作者!并感谢Chris对我的鼓励和帮助!

1 python编码

1.1 t34pssm.py

这部分代码是主函数,是之前这篇文章中的代码,可以参考这里蛋白质序列特征提取方法之——PSSM,就不详加解释了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
"""
    python t34pssm.py total_train_60.fasta ./total_train_60 ./total_train_60_pssm  w_smth n
    param:
        1. 总的fasta格式蛋白质序列
        2. 分开的fasta格式蛋白质序列的文件夹
        3. 分开的fasta格式蛋白质序列对应的pssm的文件夹 
        4.smooth-window 值,要求是奇数。
        5.截取的序列长度,一般为25,30,50。本例为30.
"""
import fileinput
import sys
from os import listdir
from os.path import isfile, join
import re
from pssm_smoothed import *
 
smplfasta = sys.argv[1
spfasta = sys.argv[2]  
check_head = re.compile(r'\>'
 
#read from undersample fasta, store
smplist = []
smpcnt = 0
for line, strin in enumerate(fileinput.input(smplfasta)):
    if check_head.match(strin):
        smplist.append(strin.strip())
        smpcnt += 1
onlyfiles = [ f for f in listdir(spfasta) if isfile(join(spfasta,f)) ]
 
fastaDict = {}
 
for fi in onlyfiles:
    cntnt = ''
    for line, strin in enumerate(fileinput.input(spfasta+'/'+fi)):
        if line == 0:
            cntnt += strin.strip()
    if cntnt in fastaDict:
        print strin
    fastaDict[cntnt] = fi
 
pssmdir = sys.argv[3]
 
w_smth = sys.argv[4]
#如果窗口值不是奇数,退出程序
if int(w_smth)%2 ==0:
    print 'Please change your input argument ' + w_smth + ' to an odd smoothing-window number!!!'
    sys.exit(1)
n=sys.argv[5]
 
for smp in smplist:
    finalist.append(pssmdir+'/'+fastaDict[smp].split('.')[0]+'.pssm')
 
for fi in finalist:
    pssm_single(fi,'total_train_60_pssm_smth',w_smth,n)

1.2 pssm_smoothed.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
"""
Retrieve smoothed PSSM features
"""
import sys
import numpy as np
import math
import re
import fileinput
def pssm_single(fi,output_smth,w_smth,n):
    # 0-19 represents amino acid 'ARNDCQEGHILKMFPSTWYV'
    w_smth=int(w_smth)
    n=int(n)
    Amino_vec = "ARNDCQEGHILKMFPSTWYV"
 
    PSSM = []
    PSSM_orig = [] 
    seq_cn = 0
    # 读取pssm文件
    for line, strin in enumerate(fileinput.input(fi)):
        if line > 2:
        str_vec = strin.split()[1:22]
        if len(str_vec) == 0:
            break
        PSSM.append(map(int, str_vec[1:]))
        seq_cn += 1
    print seq_cn           
    fileinput.close()
    PSSM_smth = np.array([[0.0]*20]*seq_cn)
    #print PSSM_smth
    PSSM_orig=np.array(PSSM)
    #print PSSM_orig
    #section for PSSM_smth features
    PSSM_smth_full=pssm_smth(PSSM_orig,PSSM_smth,w_smth,seq_cn)
    PSSM_smth_final=[[0.0]*20]*n
 
    #截取PSSM_smth_full矩阵的前n行,作为输出内容
    for i in range(n):
        PSSM_smth_final[i]=PSSM_smth_full[i]
        #print PSSM_smth_final[i]
    PSSM_smth_final_shp=np.shape(PSSM_smth_final)
    # for i in range(seq_cn):
    #   print PSSM_smth_final[i]
    file_out_smth=file(output_smth,'a')
    np.savetxt(file_out_smth, [np.reshape(PSSM_smth_final, (PSSM_smth_final_shp[0] * PSSM_smth_final_shp[1], ))], delimiter=",")
 
#这个函数会求出整条序列的smoothed pssm矩阵
def pssm_smth(PSSM_orig,PSSM_smth,w_smth,l):
    for i in range(l):
        #smooth窗口超过pssm上边界
        if i <(w_smth-1)/2:
            for j in range(i+(w_smth-1)/2+1):
                #print i,j             
                PSSM_smth[i]+=PSSM_orig[j]
                #print PSSM_smth[i]
        #smooth窗口超过pssm下边界
        elif i>=(l-(w_smth-1)/2):
            for j in range(i-(w_smth-1)/2,l):  
                #print i,j         
                PSSM_smth[i]+=PSSM_orig[j]
                #print PSSM_smth[i]
        else:
            for j in range(i-(w_smth-1)/2,i+(w_smth-1)/2+1):
                #print i,j
                PSSM_smth[i]+=PSSM_orig[j]
                #print PSSM_smth[i]    
    return PSSM_smth

1.3 总结

该算法以行为单位进行运算。以本行为中心,上下扩展,扩展的上下长度为smooth窗口值,将这些行的值加起来,存入新的矩阵的相同行位置。这样新的矩阵就包含了多个连续氨基酸序列片段的信息,会为特征提取提供新的思路。