memoize to disk - python - persistent memoization

前端 未结 9 885
再見小時候
再見小時候 2020-12-24 05:34

Is there a way to memoize the output of a function to disk?

I have a function

def getHtmlOfUrl(url):
    '''Placeholder for the expensive function whose output should be memoized to disk.'''
    ... # expensive computation
<
相关标签:
9条回答
  • 2020-12-24 06:21

    Check out joblib.Memory. It's a library for doing exactly that.

    0 讨论(0)
  • 2020-12-24 06:22

    Assuming that your data is JSON-serializable, this code should work:

    import os, json
    
    def json_file(fname):
        '''Build a decorator that caches a function's return value as JSON.

        The cache is keyed only by <fname>: once that file exists, every call
        to the decorated function returns the decoded file content and the
        call arguments are ignored.

        Args:
            fname (str): path of the JSON cache file.
        Returns:
            decorator (callable): wraps a function so that its result is read
                from <fname> when the file exists, and computed and saved to
                <fname> otherwise.
        '''
        import functools

        def decorator(function):
            @functools.wraps(function)
            def wrapper(*args, **kwargs):
                if os.path.isfile(fname):
                    with open(fname, 'r') as f:
                        return json.load(f)

                # Compute and serialize BEFORE opening the file for writing:
                # if either step raises, no empty or partial cache file is
                # left behind to break json.load() on the next call.
                ret = function(*args, **kwargs)
                payload = json.dumps(ret)
                with open(fname, 'w') as f:
                    f.write(payload)
                return ret
            return wrapper
        return decorator
    

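    Decorate getHtmlOfUrl and then simply call it; if it has been run previously, you will get your cached data. Note that the cache file is keyed only by fname, not by the function's arguments, so use a different fname for each distinct input.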

    Checked with python 2.x and python 3.x

    0 讨论(0)
  • 2020-12-24 06:29

    Most answers use a decorator. But maybe I don't want to cache the result every time I call the function.

    I made one solution using context manager, so the function can be called as

    with DiskCacher('cache_id', myfunc) as myfunc2:
        res=myfunc2(...)
    

    when you need the caching functionality.

    The 'cache_id' string is used to distinguish data files, which are named [calling_script]_[cache_id].nc. So if you are doing this in a loop, you will need to incorporate the looping variable into this cache_id, otherwise the data will be overwritten.

    Alternatively:

    myfunc2=DiskCacher('cache_id')(myfunc)
    res=myfunc2(...)
    

    Alternatively (this is probably not very useful, as the same id is used all the time):

    @DiskCacher('cache_id')
    def myfunc(*args):
        ...
    

    The complete code with examples (I'm using pickle to save/load, but this can be changed to any other save/read method. NOTE that this also assumes the function in question returns only 1 return value):

    from __future__ import print_function
    import sys, os
    import functools
    
    def formFilename(folder, varid):
        '''Compose the path of the cache file for <varid> inside <folder>

        Args:
            folder (str): cache folder path.
            varid (str): variable id used to form the file name.
        Returns:
            abpath (str): <folder> joined with a file name of the format:
                    [script_file]_[varid].nc
                where script_file is the base name of sys.argv[0] without its
                extension. The path is absolute only if <folder> is.
        '''
        # Keep only the base name of the running script: sys.argv[0] may carry
        # a directory part (e.g. "python /path/to/script.py"), and its path
        # separators would otherwise end up inside the cache file name and
        # point to sub-directories that do not exist.
        script_file=os.path.splitext(os.path.basename(sys.argv[0]))[0]
        name='[%s]_[%s].nc' %(script_file, varid)

        return os.path.join(folder, name)
    
    
    def readCache(folder, varid, verbose=True):
        '''Read cached data

        Args:
            folder (str): cache folder path.
            varid (str): variable id.
        Keyword Args:
            verbose (bool): whether to print some text info.
        Returns:
            results: the object unpickled from the cache file.
        Raises:
            IOError: if the cache file for <varid> does not exist in <folder>.
        '''
        import errno
        import pickle
        abpath_in=formFilename(folder, varid)

        # Fail with an explicit error on a missing file; without this guard
        # the function would reach its return statement with <results> never
        # assigned and raise an UnboundLocalError.
        if not os.path.exists(abpath_in):
            raise IOError(errno.ENOENT, 'Cache file not found', abpath_in)

        if verbose:
            print('\n# <readCache>: Read in variable', varid,
                    'from disk cache:\n', abpath_in)

        # NOTE: unpickling can execute arbitrary code. Only read cache files
        # from a folder that untrusted parties cannot write to.
        with open(abpath_in, 'rb') as fin:
            results=pickle.load(fin)

        return results
    
    
    def writeCache(results, folder, varid, verbose=True):
        '''Write data to disk cache

        Args:
            results: the object to pickle into the cache file.
            folder (str): cache folder path. Created if it does not exist.
            varid (str): variable id.
        Keyword Args:
            verbose (bool): whether to print some text info.
        '''
        import pickle
        abpath_out=formFilename(folder, varid)
        if verbose:
            print('\n# <writeCache>: Saving output to:\n',abpath_out)

        # Create the cache folder on demand so that the function also works
        # when it is called directly on a folder that does not exist yet.
        if folder and not os.path.isdir(folder):
            try:
                os.makedirs(folder)
            except OSError:
                # Another process may have created the folder in the meantime;
                # re-raise only if it is still missing.
                if not os.path.isdir(folder):
                    raise

        with open(abpath_out, 'wb') as fout:
            pickle.dump(results, fout)
    
    
    class DiskCacher(object):
        '''Disk cache usable as a context manager, a wrapper factory or a decorator.

        In all three forms the wrapped function is produced by _Cache2Disk()
        using the id, folder, overwrite and verbose settings stored here.
        '''

        def __init__(self, varid, func=None, folder=None, overwrite=False,
                verbose=True):
            '''Store the cache settings

            Args:
                varid (str): string id used to save cache.
                    function <func> is assumed to return only 1 return value.
            Keyword Args:
                func (callable): function object whose return values are to be
                    cached. Required for the context manager form.
                folder (str or None): cache folder path. If None, use
                    '/tmp/cache/'.
                overwrite (bool): whether to force a new computation or not.
                verbose (bool): whether to print some text info.
            '''

            if folder is None:
                self.folder='/tmp/cache/'
            else:
                self.folder=folder

            self.func=func
            self.varid=varid
            self.overwrite=overwrite
            self.verbose=verbose

        def __enter__(self):
            '''Return the caching wrapper around the function given to __init__().

            Raises:
                ValueError: if no function was given to __init__().
            '''
            if self.func is None:
                raise ValueError("Need to provide a callable function to __init__() when used as context manager.")

            return _Cache2Disk(self.func, self.varid, self.folder,
                    self.overwrite, self.verbose)

        def __exit__(self, type, value, traceback):
            '''Nothing to release; returning None lets any exception propagate.'''
            return

        def __call__(self, func=None):
            '''Return the caching wrapper around <func>

            Keyword Args:
                func (callable or None): function to wrap. If None, fall back
                    to the function given to __init__().
            Returns:
                wrapped function produced by _Cache2Disk().
            Raises:
                ValueError: if neither <func> nor __init__() supplied a
                    function.
            '''
            # Test against None rather than truthiness so that a callable
            # object that happens to evaluate as False is still used.
            _func=self.func if func is None else func
            if _func is None:
                # Fail here instead of returning a wrapper around None that
                # would only break later, when it is first called.
                raise ValueError("Need to provide a callable function to __init__() or __call__().")

            return _Cache2Disk(_func, self.varid, self.folder, self.overwrite,
                    self.verbose)
    
    
    
    def _Cache2Disk(func, varid, folder, overwrite, verbose):
        '''Wrap <func> so that its return value is cached on disk

        Args:
            func (callable): function object whose return values are to be
                cached.
            varid (str): variable id.
            folder (str): cache folder path.
            overwrite (bool): whether to force a new computation or not.
            verbose (bool): whether to print some text info.
        Returns:
            wrapper (callable): when called, returns the data read by
                <readCache> if the cache file exists and <overwrite> is not
                set. Otherwise it calls <func>, creates <folder> if needed,
                saves the result with <writeCache> and returns it.
        '''

        # The cache file path is resolved once, when the wrapper is built.
        cache_path=formFilename(folder, varid)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            must_compute=overwrite or not os.path.exists(cache_path)
            if not must_compute:
                return readCache(folder, varid, verbose)

            fresh=func(*args, **kwargs)
            if not os.path.exists(folder):
                os.makedirs(folder)
            writeCache(fresh, folder, varid, verbose)
            return fresh

        return wrapper
    
    
    
    # Demo: exercise DiskCacher in its three usage forms. The statements are
    # order-dependent: each "2nd time call" relies on the cache file written
    # by the call before it.
    if __name__=='__main__':
    
        data=range(10)  # dummy data
    
        #--------------Use as context manager--------------
        def func1(data, n):
            '''Dummy function: multiply every item of <data> by <n>.'''
            results=[i*n for i in data]
            return results
    
        # Assuming no cache file is left over from an earlier run, this call
        # computes the result and writes it to the cache.
        print('\n### Context manager, 1st time call')
        with DiskCacher('context_mananger', func1) as func1b:
            res=func1b(data, 10)
            print('res =', res)
    
        # Same id as above, so the result is read back from the cache file.
        print('\n### Context manager, 2nd time call')
        with DiskCacher('context_mananger', func1) as func1b:
            res=func1b(data, 10)
            print('res =', res)
    
        # overwrite=True forces a new computation and rewrites the cache file.
        print('\n### Context manager, 3rd time call with overwrite=True')
        with DiskCacher('context_mananger', func1, overwrite=True) as func1b:
            res=func1b(data, 10)
            print('res =', res)
    
        #--------------Return a new function--------------
        def func2(data, n):
            results=[i*n for i in data]
            return results
    
        # Calling the DiskCacher instance with a function returns the wrapped
        # function.
        print('\n### Wrap a new function, 1st time call')
        func2b=DiskCacher('new_func')(func2)
        res=func2b(data, 10)
        print('res =', res)
    
        print('\n### Wrap a new function, 2nd time call')
        res=func2b(data, 10)
        print('res =', res)
    
        #----Decorate a function using the syntax sugar----
        # Equivalent to func3=DiskCacher('pie_dec')(func3).
        @DiskCacher('pie_dec')
        def func3(data, n):
            results=[i*n for i in data]
            return results
    
        print('\n### pie decorator, 1st time call')
        res=func3(data, 10)
        print('res =', res)
    
        print('\n### pie decorator, 2nd time call.')
        res=func3(data, 10)
        print('res =', res)
    
    

    The outputs:

    ### Context manager, 1st time call
    
    # <writeCache>: Saving output to:
     /tmp/cache/[diskcache]_[context_mananger].nc
    res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
    
    ### Context manager, 2nd time call
    
    # <readCache>: Read in variable context_mananger from disk cache:
     /tmp/cache/[diskcache]_[context_mananger].nc
    res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
    
    ### Context manager, 3rd time call with overwrite=True
    
    # <writeCache>: Saving output to:
     /tmp/cache/[diskcache]_[context_mananger].nc
    res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
    
    ### Wrap a new function, 1st time call
    
    # <writeCache>: Saving output to:
     /tmp/cache/[diskcache]_[new_func].nc
    res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
    
    ### Wrap a new function, 2nd time call
    
    # <readCache>: Read in variable new_func from disk cache:
     /tmp/cache/[diskcache]_[new_func].nc
    res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
    
    ### pie decorator, 1st time call
    
    # <writeCache>: Saving output to:
     /tmp/cache/[diskcache]_[pie_dec].nc
    res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
    
    ### pie decorator, 2nd time call.
    
    # <readCache>: Read in variable pie_dec from disk cache:
     /tmp/cache/[diskcache]_[pie_dec].nc
    res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
    
    0 讨论(0)
提交回复
热议问题