Make an existing code in Java parallel/multithread

后端 未结 3 2033
孤城傲影
孤城傲影 2021-01-05 03:37

I have a very simple crawler. I want to make my current code run in a few threads. Could you point me to a tutorial or article that would help me achieve this?

I'd appreciate any guidance.

3条回答
  •  渐次进展
    2021-01-05 03:52

    You can take a look at my webcrawler example. Sorry for the lengthiness.

    import java.net.MalformedURLException;
    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;
    
    /**
     * A web crawler with a Worker pool.
     *
     * All shared crawl state is guarded by this object's monitor: workers only
     * touch it through the synchronized getNewJob()/jobCompleted() methods.
     *
     * @author Adriaan
     */
    public class WebCrawler implements Manager {

            // Worker threads, started in the constructor.
            private final Set<Worker> workers = new HashSet<Worker>();
            // Queue of URLs still to visit; URLs on already-seen hosts are
            // inserted near the end so fresh hosts get explored first.
            private final List<String> toCrawl = new ArrayList<String>();
            // URLs whose jobs have completed (dedup of finished work).
            private final Set<String> crawled = new HashSet<String>();
            // Distinct hosts encountered so far.
            private final Set<String> hosts = new HashSet<String>();
            // Distinct results (email addresses) collected so far.
            private final Set<String> results = new HashSet<String>();
            // The crawl winds down once this many results have been found.
            private final int maxResults;

            /**
             * Seeds the queue and starts the worker pool.
             *
             * @param url             seed URL to start from
             * @param numberOfWorkers pool size
             * @param maxResults      stop after this many results
             */
            public WebCrawler(String url, int numberOfWorkers, int maxResults) {
                    this.maxResults = maxResults;
                    toCrawl.add(url);
                    createWorkers(numberOfWorkers);
            }

            /**
             * Creates (and thereby starts) the worker threads.
             *
             * @param numberOfWorkers number of Worker threads to create
             */
            public void createWorkers(int numberOfWorkers) {
                    for (int i = 0; i < numberOfWorkers; i++) {
                            workers.add(new Worker(this));
                    }
            }

            /**
             * Flags every worker to stop after its current job.
             *
             * NOTE(review): a worker blocked in wait() inside getNewJob() is not
             * woken here, so it can linger until another URL is queued — a
             * limitation inherited from the original design.
             */
            private void stopWorkers() {
                    for (Worker worker : workers) {
                            worker.terminate();
                    }
            }

            /**
             * Blocks until a URL is available and wraps it in a new job.
             *
             * @return the next job to execute, never null
             */
            public synchronized Job getNewJob() {
                    while (toCrawl.isEmpty()) {
                            try {
                                    wait();
                            } catch (InterruptedException e) {
                                    // ignore, as the original did: keep waiting for work
                            }
                    }
                    return new EmailAddressCrawlJob().setDescription(toCrawl.remove(0));
            }

            /**
             * Collects a finished job's results and queues the URLs it found.
             *
             * @param job the job that has just been executed
             */
            public synchronized void jobCompleted(Job job) {
                    crawled.add(job.getDescription());
                    String host = getHost(job.getDescription());
                    boolean knownHost = hosts.contains(host);
                    if (!knownHost) {
                            System.out.println("host: " + host);
                            hosts.add(host);
                    }
                    for (String url : job.getNewDescriptions()) {
                            if (!crawled.contains(url)) {
                                    if (knownHost) {
                                            // De-prioritize known hosts, but guard the index:
                                            // the original add(size() - 1, url) threw
                                            // IndexOutOfBoundsException on an empty queue.
                                            toCrawl.add(Math.max(toCrawl.size() - 1, 0), url);
                                    } else {
                                            toCrawl.add(url);
                                    }
                            }
                    }
                    for (String result : job.getResults()) {
                            if (results.add(result)) {
                                    System.out.println("result: " + result);
                            }
                    }
                    notifyAll();
                    if (results.size() >= maxResults) {
                            stopWorkers();
                            System.out.println("Crawled hosts:");
                            for (String crawledHost : hosts) {
                                    System.out.println(crawledHost);
                            }
                            Set<String> uncrawledHosts = new HashSet<String>();
                            for (String toCrawlUrl : toCrawl) {
                                    uncrawledHosts.add(getHost(toCrawlUrl));
                            }
                            System.out.println("Uncrawled hosts:");
                            for (String unCrawledHost : uncrawledHosts) {
                                    System.out.println(unCrawledHost);
                            }
                    }
                    // Progress line every tenth completed page.
                    if (crawled.size() % 10 == 0) {
                            System.out.println("crawled=" + crawled.size() + " toCrawl="
                                            + toCrawl.size() + " results=" + results.size() + " hosts="
                                            + hosts.size() + " lastHost=" + host);
                    }
            }

            /**
             * Extracts the host part of a URL, or returns the input unchanged
             * when it contains no "://" marker.
             *
             * The original computed indexOf("://") + 3 and tested "> 0", which
             * is satisfied even when "://" is absent (-1 + 3 == 2) and sliced
             * the string at the wrong place; the explicit -1 check fixes that.
             *
             * @param host a URL (parameter name kept for interface stability)
             * @return the host portion, or the original string
             */
            public String getHost(String host) {
                    int protocolEnd = host.indexOf("://");
                    if (protocolEnd >= 0) {
                            int hostStart = protocolEnd + 3;
                            int hostEnd = host.indexOf("/", hostStart);
                            if (hostEnd < 0) {
                                    hostEnd = host.length();
                            }
                            host = host.substring(hostStart, hostEnd);
                    }
                    return host;
            }

            public static void main(String[] args) throws MalformedURLException {
                    new WebCrawler("http://www.nu.nl/", 5, 20);
            }
    }
    

    Worker

    **
     * A Worker proactively gets a Job, executes it and notifies its manager that
     * the Job is completed.
     * 
     * @author Adriaan
     */
    public class Worker extends Thread {
    
            private final Manager manager;
            private Job job = null;
            private boolean isWorking;
    
            public Worker(Manager manager) {
                    this.manager = manager;
                    isWorking = true;
                    start();
            }
    
            @Override
            public void run() {
                    System.out.println("Worker " + Thread.currentThread().getId()
                                    + " starting ");
                    while (isWorking) {
                            job = manager.getNewJob();
                            job.execute();
                            manager.jobCompleted(job);
                    }
            }
    
            public void terminate() {
                    isWorking = false;
            }
    }
    

    Manager interface

    /**
     * Contract between a Worker thread and the component that supplies its work.
     *
     * @author Adriaan
     */
    public interface Manager {

            /**
             * Hands out the next unit of work, blocking until one is available.
             *
             * @return the next job to execute
             */
            Job getNewJob();

            /**
             * Reports a finished job so its results can be collected.
             *
             * @param job the job that has just been executed
             */
            void jobCompleted(Job job);
    }
    

    Job

    import java.util.HashSet;
    import java.util.Set;
    
    /**
     * A Job is a unit of work defined by a String (the description). During
     * execution the job can obtain results and new job descriptions.
     *
     * @author Adriaan
     */
    public abstract class Job {

            // What this job works on, e.g. a URL; null until setDescription().
            private String description;
            // Results found during execute(); typed Set<String> instead of the
            // original raw Set.
            private final Set<String> results = new HashSet<String>();
            // Follow-up job descriptions found during execute().
            private final Set<String> newDescriptions = new HashSet<String>();

            /**
             * Sets the job description.
             *
             * @param description what this job should work on
             * @return this for chaining
             */
            public Job setDescription(String description) {
                    this.description = description;
                    return this;
            }

            /**
             * Executes the job.
             */
            public abstract void execute();

            /**
             * Gets the results obtained.
             *
             * @return the live (mutable) set of results
             */
            public Set<String> getResults() {
                    return results;
            }

            /**
             * Gets the new job descriptions obtained.
             *
             * @return the live (mutable) set of new descriptions
             */
            public Set<String> getNewDescriptions() {
                    return newDescriptions;
            }

            /**
             * Gets the job description.
             *
             * @return the description, or null if it was never set
             */
            public String getDescription() {
                    return description;
            }

            /**
             * Allows the implementation to add an obtained result.
             *
             * @param result a result found during execution
             */
            void addResult(String result) {
                    results.add(result);
            }

            /**
             * Allows the implementation to add an obtained description.
             *
             * @param newDescription a follow-up job description
             */
            void addNewDescription(String newDescription) {
                    newDescriptions.add(newDescription);
            }
    }
    

    A Job which crawls a page for email addresses:

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.StringTokenizer;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * A Job which crawls HTTP or HTTPS URL's for email adresses, collecting new
     * URL's to crawl along the way.
     * 
     * @author Adriaan
     */
    public class EmailAddressCrawlJob extends Job {
    
            @Override
            public void execute() {
                    try {
                            URL url = new URL(getDescription());
                            if (url != null) {
                                    String text = readText(url);
                                    extractNewDescriptions(text, url);
                                    extractResults(text);
                            }
                    } catch (MalformedURLException e) {
                            System.err.println("Bad url " + getDescription());
                    }
            }
    
            private String readText(URL url) {
                    URLConnection connection;
                    try {
                            connection = url.openConnection();
                            InputStream input = connection.getInputStream();
                            byte[] buffer = new byte[1000];
                            int num = input.read(buffer);
                            if (num > 0) {
                                    StringBuilder builder = new StringBuilder();
                                    builder.append(new String(buffer, 0, num));
                                    while (num != -1) {
                                            num = input.read(buffer);
                                            if (num != -1) {
                                                    builder.append(new String(buffer, 0, num));
                                            }
                                    }
                                    return builder.toString();
                            }
                    } catch (IOException e) {
                            //System.err.println("Could not read from " + url);
                    }
                    return "";
            }
    
            private void extractNewDescriptions(String text, URL url) {
    
                    // URL extracting code from Sun example
                    String lowerCaseContent = text.toLowerCase();
                    int index = 0;
                    while ((index = lowerCaseContent.indexOf("#");
                            String strLink = st.nextToken();
    
                            if (strLink.startsWith("javascript:")) {
                                    continue;
                            }
    
                            URL urlLink;
                            try {
                                    urlLink = new URL(url, strLink);
                                    strLink = urlLink.toString();
                            } catch (MalformedURLException e) {
                                    // System.err.println("Could not create url: " + target
                                    // + " + " + strLink);
                                    continue;
                            }
                            // only look at http links
                            String protocol = urlLink.getProtocol();
                            if (protocol.compareTo("http") != 0
                                            && protocol.compareTo("https") != 0) {
                                    // System.err.println("Ignoring: " + protocol
                                    // + " protocol in " + urlLink);
                                    continue;
                            }
                            addNewDescription(urlLink.toString());
                    }
            }
    
            private void extractResults(String text) {
                    Pattern p = Pattern
                                    .compile("([\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Za-z]{2,4})");
                    Matcher m = p.matcher(text);
                    while (m.find()) {
                            addResult(m.group(1));
                    }
            }
    }
    

    I know this answer is a bit verbose, but I thought OP might be best helped with a working example and I happened to have made one not so long ago.

提交回复
热议问题