#!/usr/bin/env python3
# coding=UTF-8

"""
Run overrepresentation analysis on Panther and return the results.

Usage:
  panther_bot.py [options] FILE
  panther_bot.py -h | --help

Arguments:
  FILE  file with gene IDs (one per line)

Options:
  -d NAME, --data=NAME  annotation dataset [default: bp]
"""

import time

import requests
from docopt import docopt

URL = "http://www.pantherdb.org/"

# Browser-like headers installed on the session, so they accompany every
# request made through it.
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0",
    "Referer": URL
}

# Maps the command-line dataset name to the value posted as the "type" field.
DATASETS = {"p": "pathway"}
# Alternative identifiers, kept disabled as in the original source:
#DATASETS.update((name, "fullgo_{0}_exp".format(name)) for name in ["mf", "bp", "cc"])
DATASETS.update((name, "fullgo_{0}_comp".format(name)) for name in ["mf", "bp", "cc"])


def _parse_results(lines):
    """Parse the lines of the text export into result tuples.

    Empty lines and lines starting with "Unclassified" are ignored.
    Everything up to and including the first line starting with "GO" or
    "PANTHER" is treated as preamble/header and skipped.

    Args:
        lines: iterable of ``str`` or ``bytes`` lines.

    Returns:
        A list of ``(term, term_id, genes, pvalue)`` tuples of strings, where
        ``pvalue`` is formatted with five decimal places.

    Raises:
        IndexError: if a data line has fewer than seven whitespace-separated
            fields.
        ValueError: if the last field of a data line is not a number.
    """
    results = []
    in_table = False
    for raw in lines:
        # requests yields bytes from iter_lines() on Python 3; the prefix
        # checks and joins below need text.
        line = raw.decode("utf-8", "replace") if isinstance(raw, bytes) else raw
        if not line or line.startswith("Unclassified"):
            continue
        if not in_table:
            # The header line itself is consumed, never parsed.
            in_table = line.startswith(("GO", "PANTHER"))
            continue
        values = line.split()
        # The term name may contain spaces, so fields are addressed from the
        # right-hand side of the line.
        term = " ".join(values[:-7])
        term_id = values[-7].strip("()")
        gene_count = values[-5]  # presumably the gene count -- TODO confirm
        pvalue = "{0:.5f}".format(float(values[-1]))
        results.append((term, term_id, gene_count, pvalue))
    return results


def run(file_name, dataset):
    """Upload a gene list to Panther and fetch the analysis results.

    Args:
        file_name: path of the file with gene IDs to upload.
        dataset: key of ``DATASETS`` selecting the annotation dataset.

    Returns:
        A list of ``(term, term_id, genes, pvalue)`` tuples of strings.

    Raises:
        KeyError: if ``dataset`` is not a key of ``DATASETS``.
        OSError: if ``file_name`` cannot be opened.
        requests.RequestException: if a request fails or the server answers
            with an HTTP error status.
    """
    # The context manager closes the session even when a request or the
    # parsing step raises.
    with requests.Session() as session:
        session.headers.update(HEADERS)
        # Visit the start page first so that the session picks up whatever
        # cookies the site sets.
        session.get(URL).raise_for_status()
        time.sleep(1)

        data = {
            "idField": "",
            "fileType": 10,
            "organism": "Homo sapiens",
            "dataset": "Homo sapiens",
            "resultType": 3
        }
        with open(file_name, "rb") as gene_file:
            files = {"fileData": (file_name, gene_file)}
            response = session.post(URL + "geneListAnalysis.do", data=data, files=files)
        response.raise_for_status()
        time.sleep(2)

        data = {
            "numUploaded": 1,
            "numRefList": 1,
            "saveUserChoices": "false",
            "type": DATASETS[dataset],
            "bonferoni": 1
        }
        # The body of this response is never read; the results are fetched
        # from the text export below.
        response = session.post(URL + "servlet/CompareToRefList", data=data, stream=True)
        response.raise_for_status()
        time.sleep(2)

        response = session.get(URL + "tools/compareToRefListTxt.jsp", stream=True)
        response.raise_for_status()
        # Parse while the session is still open: the response is streamed.
        results = _parse_results(response.iter_lines())
    return results


def main():
    """Parse the command line, run the analysis and print the results.

    Results are sorted by term identifier and printed as tab-separated lines.
    """
    args = docopt(__doc__)
    dataset = args["--data"]
    if dataset not in DATASETS:
        print("Unknown dataset name. Choose one of [mf, bp, cc, p].")
        return
    results = run(args["FILE"], dataset)
    results.sort(key=lambda x: x[1])
    for values in results:
        print("\t".join(values))


if __name__ == '__main__':
    main()