Webcrawler in Go

梦如初夏 2021-01-06 12:48

I'm trying to build a web crawler in Go where I would like to specify the maximum number of concurrent workers. They will all be working as long as there are links to explore in …

2 Answers
  •  猫巷女王i
    2021-01-06 13:51

    I wrote a solution using Go's sync.Mutex for mutual exclusion.

    When the crawler runs concurrently, it is important that only one goroutine accesses the url map at a time. I believe I implemented that as written below. Please feel free to try it out; I would appreciate your feedback, as I will learn from your comments as well.

    package main
    
    import (
        "fmt"
        "sync"
    )
    
    type Fetcher interface {
        // Fetch returns the body of URL and
        // a slice of URLs found on that page.
        Fetch(url string) (body string, urls []string, err error)
    }
    
    // SafeUrlBook ensures that only one goroutine accesses the central url map
    // at a time, so that no redundant crawling occurs.
    type SafeUrlBook struct {
        book map[string]bool
        mux  sync.Mutex
    }
    
    // doesThisExist reports whether url has already been seen; if not, it
    // records the url so that no other goroutine will crawl it again.
    func (sub *SafeUrlBook) doesThisExist(url string) bool {
        sub.mux.Lock()
        defer sub.mux.Unlock()
        if sub.book[url] {
            return true
        }
        sub.book[url] = true
        return false
    }
    // End SafeUrlBook
    
    
    // Crawl uses fetcher to recursively crawl
    // pages starting with url, to a maximum of depth.
    // Note that safeBook (*SafeUrlBook) keeps track of which urls have already been visited.
    // It is passed as a pointer so that the same map and mutex are shared by every call
    // instead of being copied.
    func Crawl(url string, depth int, fetcher Fetcher, safeBook *SafeUrlBook) {
        if depth <= 0 {
            return
        }

        if safeBook.doesThisExist(url) {
            fmt.Println("Skip", url)
            return
        }

        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q\n", url, body)
        for _, u := range urls {
            Crawl(u, depth-1, fetcher, safeBook)
        }
    }
    
    func main() {
        safeBook := &SafeUrlBook{book: make(map[string]bool)}
        Crawl("https://golang.org/", 4, fetcher, safeBook)
    }
    
    // fakeFetcher is a Fetcher that returns canned results.
    type fakeFetcher map[string]*fakeResult
    
    type fakeResult struct {
        body string
        urls []string
    }
    
    func (f fakeFetcher) Fetch(url string) (string, []string, error) {
        if res, ok := f[url]; ok {
            return res.body, res.urls, nil
        }
        return "", nil, fmt.Errorf("not found: %s", url)
    }
    
    // fetcher is a populated fakeFetcher.
    var fetcher = fakeFetcher{
        "https://golang.org/": &fakeResult{
            "The Go Programming Language",
            []string{
                "https://golang.org/pkg/",
                "https://golang.org/cmd/",
            },
        },
        "https://golang.org/pkg/": &fakeResult{
            "Packages",
            []string{
                "https://golang.org/",
                "https://golang.org/cmd/",
                "https://golang.org/pkg/fmt/",
                "https://golang.org/pkg/os/",
            },
        },
        "https://golang.org/pkg/fmt/": &fakeResult{
            "Package fmt",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
        "https://golang.org/pkg/os/": &fakeResult{
            "Package os",
            []string{
                "https://golang.org/",
                "https://golang.org/pkg/",
            },
        },
    }
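
    The original question also asks for a cap on the number of concurrent workers, which the recursive Crawl above does not provide, since it visits pages sequentially. Below is a minimal sketch of one way to add that limit, assuming it lives in the same package as the code above (so the Fetcher interface, the fetcher variable, and the fmt and sync imports are already available); ConcurrentCrawl and maxWorkers are illustrative names, not part of the original answer. A buffered channel is used as a semaphore to cap the number of in-flight Fetch calls, and a sync.WaitGroup waits for all crawl goroutines to finish.

    // ConcurrentCrawl crawls pages starting at start, to a maximum of depth,
    // allowing at most maxWorkers Fetch calls to run at the same time.
    // (Illustrative sketch, not part of the original answer.)
    func ConcurrentCrawl(start string, depth int, fetcher Fetcher, maxWorkers int) {
        type job struct {
            url   string
            depth int
        }

        visited := make(map[string]bool)
        var mu sync.Mutex                      // guards visited
        var wg sync.WaitGroup                  // waits for all crawl goroutines
        sem := make(chan struct{}, maxWorkers) // semaphore capping concurrent fetches

        var crawl func(j job)
        crawl = func(j job) {
            defer wg.Done()
            if j.depth <= 0 {
                return
            }

            mu.Lock()
            if visited[j.url] {
                mu.Unlock()
                return
            }
            visited[j.url] = true
            mu.Unlock()

            sem <- struct{}{} // acquire a worker slot
            body, urls, err := fetcher.Fetch(j.url)
            <-sem // release the slot
            if err != nil {
                fmt.Println(err)
                return
            }
            fmt.Printf("found: %s %q\n", j.url, body)

            for _, u := range urls {
                wg.Add(1)
                go crawl(job{url: u, depth: j.depth - 1})
            }
        }

        wg.Add(1)
        go crawl(job{url: start, depth: depth})
        wg.Wait()
    }

    Calling ConcurrentCrawl("https://golang.org/", 4, fetcher, 3) from main should visit the same pages as Crawl, but with at most three fetches running at once.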
    
