
# script for findeing PSSMs in genetic sequences using matchPWM. 
# input: datafile with sequences (positive or negative sequences) 

rm(list = ls())


library(Biostrings) 
library(MotifDb)

TFBS <- read.table("input.txt", header=TRUE, sep="\t")


for (i in 1:length(TFBS$tf)) {  
  traf <- as.character(TFBS$tf[i])
  #traf  
  res <- query (query (query(MotifDb, "hsapiens"), traf), "database")
  if (length(res) == "0") {next}
  for (j in 1:length(res)) {
    pfm <- res[[j]]
    pcm <- round(100*pfm)
    sequence <- as.character(TFBS$sequence[i]) 

    hits <- matchPWM(pcm, sequence,"20%",with.score=TRUE)
    rev_hits <- matchPWM(reverseComplement(pcm), sequence, "20%", with.score=TRUE)
    
 	if (length(start(hits)) != "0" ){
	hit_scores <- mcols(hits)$score/maxScore(pcm)
	forw <- rbind(data.frame(index = TFBS$index[i], tf=traf, matrix=j, start=start(hits), end=end(hits), score=hit_scores, strand="+"))
	}

	if (length(start(rev_hits)) != "0" ){
	rev_scores <- mcols(rev_hits)$score/maxScore(pcm)
  	rev <- rbind(data.frame(index = TFBS$index[i], tf=traf, matrix=j, start=start(rev_hits), end=end(rev_hits), score=rev_scores, strand="-"))
	}

	if ( length(start(hits)) == "0") {
		forw <- rbind(data.frame(index = TFBS$index[i], tf=traf, matrix=j, start="0", end="0", score="0", strand="+"))
	  
	}
	if ( length(start(rev_hits)) == "0") {
		rev <- rbind(data.frame(index = TFBS$index[i], tf=traf, matrix=j, start="0", end="0", score="0", strand="-"))
	  
	}   
    cn <- T
    if (i>1) cn <- F
    write.table(forw, file="output.txt", append=TRUE, sep="\t", row.names=F, col.names=cn)
    write.table(rev, file="output.txt", append=TRUE, sep="\t", row.names=F, col.names=F)
  }
 # rm(traf, pfm, pcm, res, seq, hits, df, j)
}
