Jupyter notebook: memory usage for each notebook

Asked by 渐次进展 on 2020-12-10 05:20

The memory on my lab's server (Ubuntu) is constantly filling up due to users never shutting down old notebooks. I would like to get a better idea of how much memory each notebook is taking up.

2 Answers
  •  轻奢々 · 2020-12-10 05:39

    I made some improvements to sharchaea's script for portability and speed.

    Mainly: it only checks the ports that notebooks are actually running on, handles different hostname options, improves the kernel-process check, and matches both ipython and jupyter processes.

    import argparse
    import re
    import subprocess
    
    import pandas as pd
    import psutil
    import requests
    import tabulate
    
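    # kernel_regex pulls the kernel ID out of the kernel-<id>.json connection file
    # in a kernel's command line; notebook_regex parses `jupyter notebook list` URLs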
    kernel_regex = re.compile(r".+kernel-(.+)\.json")
    notebook_regex = re.compile(r"(https?://([^:/]+):?(\d+)?)/?(\?token=([a-z0-9]+))?")
    
    
    def get_proc_info():
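        """Return a DataFrame of running IPython/Jupyter kernel processes: user, pid, memory_GB, kernel_ID."""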
        pids = psutil.pids()
    
        # memory info from psutil.Process
        df_mem = []
    
        for pid in pids:
            try:
                proc = psutil.Process(pid)
                cmd = " ".join(proc.cmdline())
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # the process exited, or we aren't allowed to inspect it
                continue
    
            if len(cmd) > 0 and ("jupyter" in cmd or "ipython" in cmd) and "kernel" in cmd:
                # kernel
                kernel_ID = re.sub(kernel_regex, r"\1", cmd)
    
                # resident memory (rss, first field of memory_info) in GB
                mem = proc.memory_info()[0] / float(1e9)
    
                uname = proc.username()
    
                # user, pid, memory, kernel_ID
                df_mem.append([uname, pid, mem, kernel_ID])
    
        df_mem = pd.DataFrame(df_mem)
        df_mem.columns = ["user", "pid", "memory_GB", "kernel_ID"]
        return df_mem
    
    
    def get_running_notebooks():
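        """Parse `jupyter notebook list` output (skipping its header line) into base URLs and tokens."""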
        notebooks = []
    
        for n in subprocess.Popen(
            ["jupyter", "notebook", "list"], stdout=subprocess.PIPE
        ).stdout.readlines()[1:]:
            match = re.match(notebook_regex, n.decode())
            if match:
                base_url, host, port, _, token = match.groups()
                notebooks.append({"base_url": base_url, "token": token})
            else:
                print("Unknown format: {}".format(n.decode()))
    
        return notebooks
    
    
    def get_session_info(password=None):
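        """Query each running server's /api/sessions endpoint and return one row per kernel/session."""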
        df_nb = []
        kernels = []
    
        for notebook in get_running_notebooks():
            s = requests.Session()
            if notebook["token"] is not None:
                s.get(notebook["base_url"] + "/?token=" + notebook["token"])
            else:
                # do a get to the base url to get the session cookies
                s.get(notebook["base_url"])
            if password is not None:
                # Seems jupyter auth process has changed, need to first get a cookie,
                # then add that cookie to the data being sent over with the password
                data = {"password": password}
                data.update(s.cookies)
                s.post(notebook["base_url"] + "/login", data=data)
    
            res = s.get(notebook["base_url"] + "/api/sessions")
    
            if res.status_code != 200:
                raise Exception(res.json())
    
            for sess in res.json():
                kernel_ID = sess["kernel"]["id"]
                if kernel_ID not in kernels:
                    kernel = {
                        "kernel_ID": kernel_ID,
                        "kernel_name": sess["kernel"]["name"],
                        "kernel_state": sess["kernel"]["execution_state"],
                        "kernel_connections": sess["kernel"]["connections"],
                        # "notebook_url": notebook["base_url"] + "/notebook/" + sess["id"],
                        "notebook_path": sess["path"],
                    }
                    kernel.update(notebook)
                    df_nb.append(kernel)
                    kernels.append(kernel_ID)
    
        df_nb = pd.DataFrame(df_nb)
        del df_nb["token"]
        return df_nb
    
    
    def parse_args():
        parser = argparse.ArgumentParser(description="Find memory usage.")
        parser.add_argument("--password", help="password (only needed if pass-protected)")
    
        return parser.parse_args()
    
    
    def main(password=None, print_ascii=False):
        df_mem = get_proc_info()
        df_nb = get_session_info(password)
    
        # joining tables
        df = pd.merge(df_nb, df_mem, on=["kernel_ID"], how="inner")
        df = df.sort_values("memory_GB", ascending=False).reset_index(drop=True)
        if print_ascii:
            print(tabulate.tabulate(df, headers=(df.columns.tolist())))
        return df
    
    
    if __name__ == "__main__":
        args = vars(parse_args())
        main(args["password"], print_ascii=True)
    

    I'll probably continue to make updates to this at this gist.

    Edit: the code has been updated to work with newer versions of Jupyter that use token authentication, to rely only on psutil (which makes it Windows compatible), and to run on Python 3.
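    For anyone who wants to call this from another script or an admin notebook rather than the command line, here is a minimal usage sketch. It assumes the code above is saved as notebook_mem.py (a hypothetical filename); main() returns the merged pandas DataFrame, so you can filter or aggregate it however you like.

    # minimal usage sketch, assuming the script above was saved as notebook_mem.py
    from notebook_mem import main

    # pass password=... only if the notebook servers are password-protected
    df = main(password=None, print_ascii=False)

    # one row per kernel: show the heaviest notebooks, then total memory per user
    print(df[["user", "notebook_path", "memory_GB"]])
    print(df.groupby("user")["memory_GB"].sum().sort_values(ascending=False))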
