Question
I am writing a Go project, a simple web crawler that crawls the links on a website. I want to experiment with concurrency features such as goroutines and channels, but when I run it, it doesn't get anywhere: nothing is shown, as if nothing is happening. I have no idea what went wrong. Can somebody point it out for me?
It works and shows all the crawled links if I remove the channel logic, but I want it to send the links into a buffered channel and then display them before the program ends. The program is supposed to be able to crawl to any depth specified in the program; currently the depth is 1.
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
    "time"

    "golang.org/x/net/html"
)

// Link type to be sent over channel
type Link struct {
    URL string
    ok  bool
}

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }
    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    ch := make(chan *Link, 5)
    crawl(url, 1, ch)

    visited := make(map[string]bool)
    time.Sleep(2 * time.Second)

    for link := range ch {
        if _, ok := visited[link.URL]; !ok {
            visited[link.URL] = true
        }
    }
    close(ch)

    for l := range visited {
        fmt.Println(l)
    }
}

func crawl(url string, n int, ch chan *Link) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }
    b := resp.Body
    defer b.Close()

    z := html.NewTokenizer(b)
    nextN := n - 1
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }
            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                go crawl(result, nextN, ch)
                ch <- &Link{result, true}
            }
        }
    }
}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
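For context, the hang can be reproduced without any crawling at all: main both fills and drains the same channel, but crawl runs synchronously, so once the buffered channel of capacity 5 is full, the send ch <- &Link{...} blocks with no receiver running; and even when a page has five or fewer links, for link := range ch can never finish, because close(ch) sits after the loop. A minimal sketch of the same shape (standalone, not crawler code):

package main

func main() {
    ch := make(chan int, 2) // buffered channel, capacity 2

    // Synchronous producer: the third send blocks forever,
    // because the buffer is full and no receiver is running yet.
    for i := 0; i < 3; i++ {
        ch <- i
    }

    // Never reached; and even if it were, range only ends
    // after close(ch), which only happens after this loop.
    for v := range ch {
        _ = v
    }
    close(ch)
}

In this stripped-down form the runtime's deadlock detector fires; in the crawler, goroutines blocked on network I/O can keep it from firing, so the program simply hangs with no output.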
UPDATED:
After some fiddling I figured out how to change the code to remove the data races. However, I still don't know how to avoid crawling URLs that were visited previously (maybe I should start another question?):
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"

    "golang.org/x/net/html"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }
    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    for link := range newCrawl(url, 1) {
        fmt.Println(link)
    }
}

func newCrawl(url string, num int) chan string {
    ch := make(chan string, 20)

    go func() {
        crawl(url, 1, ch)
        close(ch)
    }()

    return ch
}

func crawl(url string, n int, ch chan string) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }
    b := resp.Body
    defer b.Close()

    z := html.NewTokenizer(b)
    nextN := n - 1
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }
            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                done := make(chan struct{})
                go func() {
                    crawl(result, nextN, ch)
                    close(done)
                }()
                <-done
                ch <- result
            }
        }
    }
}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
Answer 1:
I think that recursively spawning goroutines is not a good idea; it can easily get out of control. I would prefer a flatter model like this:
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
    "sync"

    "golang.org/x/net/html"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }
    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    wg := NewWorkGroup(1)
    wg.Crawl(url)

    for k, v := range wg.urlMap {
        fmt.Printf("%s: %d\n", k, v)
    }
}

// Link represents a single link and its depth
type Link struct {
    url  string
    deph uint32
}

// WorkGroup wraps everything into one group
type WorkGroup struct {
    *sync.WaitGroup
    maxDeph uint32
    numW    int
    pool    chan *Worker
    linkQ   chan Link
    urlMap  map[string]uint32
}

type Worker struct {
    result chan []Link
}

func newWorker() *Worker {
    return &Worker{
        result: make(chan []Link),
    }
}

func NewWorkGroup(maxDeph uint32) *WorkGroup {
    numW := int(maxDeph)
    if maxDeph > 10 {
        numW = 10
    }
    return &WorkGroup{
        WaitGroup: new(sync.WaitGroup),
        maxDeph:   maxDeph,
        numW:      numW,
        pool:      make(chan *Worker, numW),
        linkQ:     make(chan Link, 100),
        urlMap:    make(map[string]uint32),
    }
}

// dispatch workers -> filter visited -> send unvisited links to the channel
// pool + dispatcher keep order, so workers go level by level
func (wg *WorkGroup) spawnDispatcher() {
    wg.Add(1)
    go func() {
        defer wg.Done()
        defer close(wg.linkQ)
        for w := range wg.pool {
            links := <-w.result
            for i := 0; i < len(links); i++ {
                if _, ok := wg.urlMap[links[i].url]; !ok {
                    wg.urlMap[links[i].url] = links[i].deph
                    // don't process links that have reached max depth
                    if links[i].deph < wg.maxDeph {
                        select {
                        case wg.linkQ <- links[i]:
                            // sent fine
                            continue
                        default:
                            // channel buffer is full; avoid a possible deadlock
                        }
                        // drop the rest of the links
                        break
                    }
                }
            }
            // empty link channel + nothing in process = end
            if len(wg.linkQ) == 0 && len(wg.pool) == 0 {
                return
            }
        }
    }()
}

// Crawl initializes the goroutines and crawls the url
func (wg *WorkGroup) Crawl(url string) {
    defer close(wg.pool)

    wg.spawnCrawlers()
    wg.spawnDispatcher()

    wg.linkQ <- Link{url: url, deph: 0}
    wg.Wait()
}

func (wg *WorkGroup) spawnCrawlers() {
    // custom number of workers, derived from maxDeph
    for i := 0; i < wg.numW; i++ {
        wg.newCrawler()
    }
}

func (wg *WorkGroup) newCrawler() {
    wg.Add(1)
    go func(w *Worker) {
        defer wg.Done()
        defer close(w.result)
        for link := range wg.linkQ {
            wg.pool <- w
            w.result <- getExternalUrls(link)
        }
    }(newWorker())
}

// getExternalUrls is the original crawl function, slightly modified
func getExternalUrls(source Link) []Link {
    resp, err := http.Get(source.url)
    if err != nil {
        log.Printf("Can not reach the site. Error = %v\n", err)
        return nil
    }
    b := resp.Body
    defer b.Close()

    z := html.NewTokenizer(b)
    links := []Link{}

    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return links
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            url, ok := getHrefTag(current)
            if ok && strings.HasPrefix(url, "http") {
                links = append(links, Link{url: url, deph: source.deph + 1})
            }
        }
    }
    return links
}

// getHrefTag is unchanged from the question
func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
Source: https://stackoverflow.com/questions/46386353/understanding-correct-use-of-channels-in-golang-concurrent-context