awk to find overlaps

只谈情不闲聊 提交于 2019-12-02 11:12:16

Using GNU Awk version 4 or later (the script relies on arrays of arrays), you could try:

gawk -f a.awk file file

where a.awk is:

# Pass 1 (first read of the file, where NR equals FNR): remember every data
# row's Start ($2) and End ($3), keyed by Group ($1) and a running row index.
# The first line is the input header and is skipped without being stored.
NR==FNR {
    if (FNR<=1)
        next
    i++                 # global row counter shared by both arrays
    a[$1][i]=$2         # interval start for this group/row
    b[$1][i]=$3         # interval end for this group/row
    next
}
# Pass 2, first line: the input header is replaced by our own header.
# fmt is the column layout reused for every output row.
FNR==1 {
    fmt="%-7s%-10s%-10s%-10s%-10s\n"
    printf(fmt, "Group", "Start", "End", "NewStart", "NewEnd")
}
# Pass 2, data rows: for each interval, look up the other intervals of the same
# group that overlap it (checkInside fills R[1..n] with their row indices).
# Whenever the intersection covers more than 50% of BOTH intervals, the row's
# new interval is widened to the union of the two. Rows without such a partner
# are printed with NewStart/NewEnd equal to Start/End.
FNR>1{
    # Default: the new interval is the original one.
    $4=$2; $5=$3
    n=checkInside($1,$2,$3)
    # Length of the current row's interval; loop-invariant, so computed once.
    selfLen=$3-$2
    # A zero-length interval has no defined overlap percentage. Skipping it
    # avoids gawk's fatal "division by zero attempted" error.
    if (n>0 && selfLen!=0) {
        ff=0; x=$2; y=$3
        for (i=1; i<=n; i++) {
            ar=a[$1][R[i]]; br=b[$1][R[i]]
            otherLen=br-ar
            # Same guard for a zero-length candidate interval.
            if (otherLen==0)
                continue
            getIntersect($2,$3,ar,br)   # sets i1, i2 (intersection bounds)
            getLargest($2,$3,ar,br)     # sets r1, r2 (union bounds)
            ovl=((i2-i1)/selfLen)*100   # overlap as % of this row's length
            ovr=((i2-i1)/otherLen)*100  # overlap as % of the other row's length
            if (ovl>50 && ovr>50) {
                if (r1<x) x=r1
                if (r2>y) y=r2
                ff=1
            }
        }
        # ff is set only when at least one partner passed the 50%/50% test.
        if (ff) {
            $4=x; $5=y
        }
    }
    printf fmt,$1,$2,$3,$4,$5
}

# Compute the smallest start and the largest end of two intervals
# (x1,y1) and (x2,y2). Results are left in the globals r1 (start) and r2 (end).
function getLargest(x1,y1,x2,y2) {
    if (x1<=x2)
        r1=x1
    else
        r1=x2

    if (y1>=y2)
        r2=y1
    else
        r2=y2
}

# Compute the bounds of the intersection of (x1,y1) with (x2,y2) and leave them
# in the globals i1 (start) and i2 (end). The start is x1 only when x1 lies
# within [x2,y2]; in every other case it is x2. The end is the smaller of the
# two end points.
function getIntersect(x1,y1,x2,y2) {
    i1=(x1<x2 || x1>y2) ? x2 : x1
    i2=y2
    if (y1<=y2)
        i2=y1
}

# Find the stored intervals of group g that have the start x or the end y of
# the query interval lying inside them, ignoring any stored interval identical
# to (x,y). Their row indices are written to the global array R[1..count] and
# the count is returned; when nothing matches, the returned value is the
# uninitialized local j. Parameters i, j, x1, y1 are locals, not arguments.
function checkInside(g,x,y,i,j,x1,y1) {
    R["x"]=0    # make sure R exists as an array even when nothing matches
    for (i in a[g]) {
        x1=a[g][i]
        y1=b[g][i]
        # An exact duplicate of the query is never reported.
        if (x==x1 && y==y1)
            continue
        if ((x1<=x && x<=y1) || (x1<=y && y<=y1))
            R[++j]=i
    }
    return j
}

Output:

Group  Start     End       NewStart  NewEnd    
chr1   117132092 118875009 117027758 119458215 
chr1   117027758 119458215 117027758 119458215 
chr1   103756473 104864582 103354114 104864582 
chr1   105093795 106219211 105093795 106219211 
chr1   103354114 104747251 102741437 105235140 
chr1   102741437 105235140 102741437 105235140 
chr1   100090254 101094139 100090254 101614730 
chr1   100426977 101614730 100090254 101614730 
chr2   86644663  87767193  86644663  87767193  
chr2   82473711  83636545  82473711  83636545  
chr2   83896702  85079032  83876122  85091910  
chr2   83876122  85091910  83876122  85091910  
chr2   82943211  84350917  82943211  84350917  
chr3   89410051  90485635  89405753  90485635  
chr3   89405753  90485635  89405753  90485635  
chr3   86491492  87593215  86491492  87593215  
chr3   82507157  83738004  82507157  83738004  
chr3   85059618  86362254  85059618  86362254  
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!