Public gist
Expires: never
Perform gene set overrepresentation analysis with PantherDB server (updated - May 2016).
gravatar
plw (Paweł Widera) - created 8 years and 4 months ago
added file: panther_bot.py
panther_bot.py
#!/usr/bin/env python3
# coding=UTF-8
"""
Run overrepresentation analysis on Panther and return the results.
Usage: panther_bot.py [options] <gene_set>
panther_bot.py -h | --help
Arguments:
<gene_set> file with gene IDs (one per line)
Options:
-d NAME, --data=NAME annotation dataset [default: bp]
"""
import time
import requests
from docopt import docopt
# Base address of the Panther server; request paths are appended to it.
URL = "http://www.pantherdb.org/"

# Browser-like headers installed on the HTTP session used for every request.
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
    "Referer": URL,
}

# Maps the short dataset names accepted on the command line to the value
# submitted as the "type" form field of the comparison request.
# NOTE(review): the GO datasets use the "_comp" suffix; an "_exp" suffix was
# used previously (presumably a different annotation set - confirm against
# the Panther documentation before switching).
DATASETS = {"p": "pathway"}
DATASETS.update(
    {go: "fullgo_{0}_comp".format(go) for go in ("mf", "bp", "cc")}
)
def _parse_results(lines, encoding="utf-8"):
    """Convert the lines of the Panther text export into result tuples.

    Lines may be ``bytes`` (what ``Response.iter_lines()`` yields on
    Python 3) or ``str``; bytes are decoded with *encoding* first.

    Everything up to and including the first line starting with "GO" or
    "PANTHER" is skipped, as are empty lines and lines starting with
    "Unclassified".

    Returns a list of ``(term, go_id, genes, pvalue)`` string tuples, where
    the fields are taken by position from the end of each
    whitespace-separated line.
    """
    results = []
    in_table = False
    for line in lines:
        if isinstance(line, bytes):
            # Without decoding, str-based startswith()/split() below would
            # raise TypeError or produce bytes that cannot be joined.
            line = line.decode(encoding, errors="replace")
        if not line or line.startswith("Unclassified"):
            continue
        if not in_table:
            # The header row marks the start of the data; skip it and
            # anything before it.
            if line.startswith(("GO", "PANTHER")):
                in_table = True
            continue
        values = line.split()
        # The term name may contain spaces, so fields are addressed from the
        # right: the last seven columns are fixed, the rest is the term.
        term = " ".join(values[:-7])
        go_id = values[-7].strip("()")
        genes = values[-5]
        pvalue = "{0:.5f}".format(float(values[-1]))
        results.append((term, go_id, genes, pvalue))
    return results


def run(file_name, dataset):
    """Upload a gene list to Panther and return the overrepresentation results.

    Arguments:
        file_name: path of the file with gene IDs; it is uploaded as-is.
        dataset: key of DATASETS selecting the annotation dataset.

    Returns:
        A list of ``(term, go_id, genes, pvalue)`` string tuples, in the
        order the server reported them.

    Raises:
        KeyError: if *dataset* is not a key of DATASETS.
        OSError: if *file_name* cannot be opened.
        requests.HTTPError: if the server answers any request with an
            error status.
    """
    # Resolve the dataset before any network traffic so a bad name fails fast.
    dataset_type = DATASETS[dataset]

    # The context manager closes the session even when a request fails.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        # Visit the front page before submitting any form
        # (presumably to obtain the session cookies - confirm).
        session.get(URL).raise_for_status()
        time.sleep(1)

        upload_form = {
            "idField": "",
            "fileType": 10,
            "organism": "Homo sapiens",
            "dataset": "Homo sapiens",
            "resultType": 3
        }
        with open(file_name, "rb") as genes:
            files = {"fileData": (file_name, genes)}
            response = session.post(
                URL + "geneListAnalysis.do", data=upload_form, files=files
            )
        response.raise_for_status()
        time.sleep(2)

        compare_form = {
            "numUploaded": 1,
            "numRefList": 1,
            "saveUserChoices": "false",
            "type": dataset_type,
            "bonferoni": 1
        }
        response = session.post(
            URL + "servlet/CompareToRefList", data=compare_form, stream=True
        )
        response.raise_for_status()
        time.sleep(2)

        response = session.get(URL + "tools/compareToRefListTxt.jsp", stream=True)
        response.raise_for_status()
        return _parse_results(
            response.iter_lines(), response.encoding or "utf-8"
        )
def main():
    """Command-line entry point.

    Parses the arguments described in the module docstring, runs the
    analysis for the requested dataset and prints the results as
    tab-separated lines ordered by their second field (the term ID).
    An unknown dataset name only prints a message; nothing is submitted.
    """
    options = docopt(__doc__)
    dataset = options["--data"]
    if dataset in ("mf", "bp", "cc", "p"):
        rows = run(options["<gene_set>"], dataset)
        for row in sorted(rows, key=lambda row: row[1]):
            print("\t".join(row))
    else:
        print("Unknown dataset name. Choose one of [mf, bp, cc, p].")
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()