Question
I am writing a Go project, a simple web crawler that crawls the links on a website. I want to experiment with concurrency features such as goroutines and channels, but when I run it, it doesn't get anywhere: nothing is shown, as if nothing is happening. I have no idea what went wrong. Can somebody point it out for me?
It works and shows all the crawled links if I remove the channel logic, but I want it to send the links into a buffered channel and then display them before the program ends. The program is supposed to be able to crawl to any depth specified in the program; currently the depth is 1.
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
    "time"

    "golang.org/x/net/html"
)

// Link type to be sent over channel
type Link struct {
    URL string
    ok  bool
}

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }
    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    ch := make(chan *Link, 5)
    crawl(url, 1, ch)

    visited := make(map[string]bool)
    time.Sleep(2 * time.Second)

    for link := range ch {
        if _, ok := visited[link.URL]; !ok {
            visited[link.URL] = true
        }
    }
    close(ch)

    for l := range visited {
        fmt.Println(l)
    }
}

func crawl(url string, n int, ch chan *Link) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }
    b := resp.Body
    defer b.Close()

    z := html.NewTokenizer(b)
    nextN := n - 1
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }
            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                go crawl(result, nextN, ch)
                ch <- &Link{result, true}
            }
        }
    }
}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
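For context, the hang can be reproduced without any crawling at all: main both fills and drains the same channel, but crawl runs synchronously, so once the buffered channel of capacity 5 is full, the send ch <- &Link{...} blocks with no receiver running; and even when a page has five or fewer links, for link := range ch can never finish, because close(ch) sits after the loop. A minimal sketch of the same shape (standalone, not crawler code):

package main

func main() {
    ch := make(chan int, 2) // buffered channel, capacity 2

    // Synchronous producer: the third send blocks forever,
    // because the buffer is full and no receiver is running yet.
    for i := 0; i < 3; i++ {
        ch <- i
    }

    // Never reached; and even if it were, range only ends
    // after close(ch), which only happens after this loop.
    for v := range ch {
        _ = v
    }
    close(ch)
}

In this stripped-down form the runtime's deadlock detector fires; in the crawler, goroutines blocked on network I/O can keep it from firing, so the program simply hangs with no output.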
UPDATED:
After some fiddling I figured out how to change the code to remove the data races. However, I still don't know how to avoid crawling URLs that were visited previously (maybe I should start another question?):
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"

    "golang.org/x/net/html"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }
    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    for link := range newCrawl(url, 1) {
        fmt.Println(link)
    }
}

func newCrawl(url string, num int) chan string {
    ch := make(chan string, 20)

    go func() {
        crawl(url, 1, ch)
        close(ch)
    }()

    return ch
}

func crawl(url string, n int, ch chan string) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }
    b := resp.Body
    defer b.Close()

    z := html.NewTokenizer(b)
    nextN := n - 1
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }
            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                done := make(chan struct{})
                go func() {
                    crawl(result, nextN, ch)
                    close(done)
                }()
                <-done
                ch <- result
            }
        }
    }
}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
Answer 1:
I think that recursively spawning goroutines is not a good idea; it can easily get out of control. I would prefer a flatter model like this:
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
    "sync"

    "golang.org/x/net/html"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }
    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    wg := NewWorkGroup(1)
    wg.Crawl(url)

    for k, v := range wg.urlMap {
        fmt.Printf("%s: %d\n", k, v)
    }
}

// Link represents a single link and its depth
type Link struct {
    url  string
    deph uint32
}

// WorkGroup wraps everything into one group
type WorkGroup struct {
    *sync.WaitGroup
    maxDeph uint32
    numW    int
    pool    chan *Worker
    linkQ   chan Link
    urlMap  map[string]uint32
}

type Worker struct {
    result chan []Link
}

func newWorker() *Worker {
    return &Worker{
        result: make(chan []Link),
    }
}

func NewWorkGroup(maxDeph uint32) *WorkGroup {
    numW := int(maxDeph)
    if maxDeph > 10 {
        numW = 10
    }
    return &WorkGroup{
        WaitGroup: new(sync.WaitGroup),
        maxDeph:   maxDeph,
        numW:      numW,
        pool:      make(chan *Worker, numW),
        linkQ:     make(chan Link, 100),
        urlMap:    make(map[string]uint32),
    }
}

// dispatch workers -> filter visited -> send unvisited links to the channel
// pool + dispatcher keep order, so workers go level by level
func (wg *WorkGroup) spawnDispatcher() {
    wg.Add(1)
    go func() {
        defer wg.Done()
        defer close(wg.linkQ)
        for w := range wg.pool {
            links := <-w.result
            for i := 0; i < len(links); i++ {
                if _, ok := wg.urlMap[links[i].url]; !ok {
                    wg.urlMap[links[i].url] = links[i].deph
                    // don't process links that have reached max depth
                    if links[i].deph < wg.maxDeph {
                        select {
                        case wg.linkQ <- links[i]:
                            // sent fine
                            continue
                        default:
                            // channel buffer is full; avoid a possible deadlock
                        }
                        // drop the rest of the links
                        break
                    }
                }
            }
            // empty link channel + nothing in process = end
            if len(wg.linkQ) == 0 && len(wg.pool) == 0 {
                return
            }
        }
    }()
}

// Crawl initializes the goroutines and crawls the url
func (wg *WorkGroup) Crawl(url string) {
    defer close(wg.pool)

    wg.spawnCrawlers()
    wg.spawnDispatcher()

    wg.linkQ <- Link{url: url, deph: 0}
    wg.Wait()
}

func (wg *WorkGroup) spawnCrawlers() {
    // custom number of workers, derived from maxDeph
    for i := 0; i < wg.numW; i++ {
        wg.newCrawler()
    }
}

func (wg *WorkGroup) newCrawler() {
    wg.Add(1)
    go func(w *Worker) {
        defer wg.Done()
        defer close(w.result)
        for link := range wg.linkQ {
            wg.pool <- w
            w.result <- getExternalUrls(link)
        }
    }(newWorker())
}

// getExternalUrls is the original crawl function, slightly modified
func getExternalUrls(source Link) []Link {
    resp, err := http.Get(source.url)
    if err != nil {
        log.Printf("Can not reach the site. Error = %v\n", err)
        return nil
    }
    b := resp.Body
    defer b.Close()

    z := html.NewTokenizer(b)
    links := []Link{}

    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return links
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            url, ok := getHrefTag(current)
            if ok && strings.HasPrefix(url, "http") {
                links = append(links, Link{url: url, deph: source.deph + 1})
            }
        }
    }
    return links
}

// getHrefTag is unchanged from the question
func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
Source: https://stackoverflow.com/questions/46386353/understanding-correct-use-of-channels-in-golang-concurrent-context