How to write a crawler?

后端 未结 10 1863
感情败类
感情败类 2020-12-02 03:47

I have had thoughts of trying to write a simple crawler that might crawl and produce a list of its findings for our NPO's websites and content.

Does anybody have any thoughts on how to do this?

10条回答
  •  悲&欢浪女
    2020-12-02 04:21

    I wrote a simple web crawler using Reactive Extensions (Rx) in .NET.

    https://github.com/Misterhex/WebCrawler

    public class Crawler
        {
        class ReceivingCrawledUri : ObservableBase
        {
            public int _numberOfLinksLeft = 0;
    
            private ReplaySubject _subject = new ReplaySubject();
            private Uri _rootUri;
            private IEnumerable _filters;
    
            public ReceivingCrawledUri(Uri uri)
                : this(uri, Enumerable.Empty().ToArray())
            { }
    
            public ReceivingCrawledUri(Uri uri, params IUriFilter[] filters)
            {
                _filters = filters;
    
                CrawlAsync(uri).Start();
            }
    
            protected override IDisposable SubscribeCore(IObserver observer)
            {
                return _subject.Subscribe(observer);
            }
    
            private async Task CrawlAsync(Uri uri)
            {
                using (HttpClient client = new HttpClient() { Timeout = TimeSpan.FromMinutes(1) })
                {
                    IEnumerable result = new List();
    
                    try
                    {
                        string html = await client.GetStringAsync(uri);
                        result = CQ.Create(html)["a"].Select(i => i.Attributes["href"]).SafeSelect(i => new Uri(i));
                        result = Filter(result, _filters.ToArray());
    
                        result.ToList().ForEach(async i =>
                        {
                            Interlocked.Increment(ref _numberOfLinksLeft);
                            _subject.OnNext(i);
                            await CrawlAsync(i);
                        });
                    }
                    catch
                    { }
    
                    if (Interlocked.Decrement(ref _numberOfLinksLeft) == 0)
                        _subject.OnCompleted();
                }
            }
    
            private static List Filter(IEnumerable uris, params IUriFilter[] filters)
            {
                var filtered = uris.ToList();
                foreach (var filter in filters.ToList())
                {
                    filtered = filter.Filter(filtered);
                }
                return filtered;
            }
        }
    
        public IObservable Crawl(Uri uri)
        {
            return new ReceivingCrawledUri(uri, new ExcludeRootUriFilter(uri), new ExternalUriFilter(uri), new AlreadyVisitedUriFilter());
        }
    
        public IObservable Crawl(Uri uri, params IUriFilter[] filters)
        {
            return new ReceivingCrawledUri(uri, filters);
        }
    }
    

    and you can use it as follows:

    // Emits each discovered URI as it is found, then signals completion.
    Crawler crawler = new Crawler();
    IObservable<Uri> observable = crawler.Crawl(new Uri("http://www.codinghorror.com/"));
    observable.Subscribe(
        onNext: uri => Console.WriteLine(uri),
        onCompleted: () => Console.WriteLine("Crawling completed"));
    

提交回复
热议问题