Wikipedia page parsing program caught in endless graph cycle


Question


My program is caught in a cycle that never ends, and I can't see how it got into this trap or how to avoid it.

It's parsing Wikipedia data and I think it's just following a connected component around and around.

Maybe I could store the pages I've already visited in a set, and skip any page that is already in that set?
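
Something like the sketch below is roughly what I have in mind. It is only a minimal example, and crawl and page_title are hypothetical stand-ins for the real fetching code, but the visited set is the part that would stop the repetition, because Set.add returns false when the element is already there.

import java.util.HashSet;
import java.util.Set;

public class VisitedSketch
{
    // every page title that has already been processed
    private static final Set<String> visited = new HashSet<String>();

    // hypothetical stand-in for the real fetch-and-follow-links logic
    public static void crawl( String page_title )
    {
        // add() returns false if the title is already in the set,
        // so a page that was seen before is never processed twice
        if ( !visited.add( page_title ) )
        {
            return;
        }

        System.out.println( "visiting: " + page_title );

        // ... fetch page_title here and call crawl() on each linked title ...
    }
}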

This is my project; it's quite small, only three short classes.

This is a link to the data it generates; I stopped it short, otherwise it would have gone on and on.

This is the laughably small toy input that generated that mess.

It's the same project I was working on when I asked this question.

What follows is the entirety of the code.

The main class:

    public static void main(String[] args) throws Exception
    {

        String name_list_file = "/home/matthias/Workbench/SUTD/nytimes_corpus/NYTimesCorpus/2005/01/02/test/people_test.txt";

        String single_name;

        try (   
                // read in the original file, list of names, w/e
                InputStream stream_for_name_list_file = new FileInputStream( name_list_file );
                InputStreamReader stream_reader = new InputStreamReader( stream_for_name_list_file , Charset.forName("UTF-8"));
                BufferedReader line_reader = new BufferedReader( stream_reader );
            ) 
        {
            while (( single_name = line_reader.readLine() ) != null) 
            {
                // URL-encode the name so it is safe to use in the query string
                String associated_alias = URLEncoder.encode( single_name , "UTF-8");

                String platonic_key = single_name;
                System.out.println("now processing: " + platonic_key);

                Wikidata_Q_Reader.getQ( platonic_key, associated_alias );
            }
        }

        //print the data structure
        Wikidata_Q_Reader.print_data();

    }

The Wikipedia reader / value grabber:

static Map<String, HashSet<String> > q_valMap = new HashMap<String, HashSet<String> >();

//public static String[] getQ(String variable_entity) throws Exception
public static void getQ( String platonic_key, String associated_alias ) throws Exception
{


    //get the corresponding wikidata page
    //check the validity of the URL
    String URL_czech = "https://www.wikidata.org/wiki/Special:ItemByTitle?site=en&page=" + associated_alias + "&submit=Search";
    URL wikidata_page = new URL(URL_czech);
    HttpURLConnection wiki_connection = (HttpURLConnection)wikidata_page.openConnection();
    InputStream wikiInputStream = null;


    try 
    {
        // try to connect and use the input stream
        wiki_connection.connect();
        wikiInputStream = wiki_connection.getInputStream();
    } 
    catch(IOException e) 
    {
        // failed, try using the error stream instead
        // (getErrorStream() can return null, which would make the reader below throw a NullPointerException)
        wikiInputStream = wiki_connection.getErrorStream();
    }



        BufferedReader wiki_data_pagecontent = new BufferedReader(
                                                   new InputStreamReader(
                                                        wikiInputStream ));
        String line_by_line;



        while ((line_by_line = wiki_data_pagecontent.readLine()) != null) 
        {
            // if we can determine it's a disambig page we need to send it off to get all 
            // the possible senses in which it can be used.
            Pattern disambig_pattern = Pattern.compile("<div class=\"wikibase-entitytermsview-heading-description \">Wikipedia disambiguation page</div>");
            Matcher disambig_indicator = disambig_pattern.matcher(line_by_line);
            // note: matches() only succeeds if the whole line is exactly this pattern
            if (disambig_indicator.matches()) 
            {
                //off to get the different usages
                Wikipedia_Disambig_Fetcher.all_possibilities(  platonic_key, associated_alias );
            }
            else
            {
                //get the Q value off the page by matching
                Pattern q_page_pattern = Pattern.compile("<!-- wikibase-toolbar --><span class=\"wikibase-toolbar-container\"><span class=\"wikibase-toolbar-item " +
                        "wikibase-toolbar \">\\[<span class=\"wikibase-toolbar-item wikibase-toolbar-button wikibase-toolbar-button-edit\"><a " +
                        "href=\"/wiki/Special:SetSiteLink/(.*?)\">edit</a></span>\\]</span></span>");

                Matcher match_Q_component = q_page_pattern.matcher(line_by_line);
                if ( match_Q_component.matches() ) 
                {
                    String Q = match_Q_component.group(1);

                    // 'Q' should be added to a set, since each entity can hold
                    // multiple Q values because of disambiguation
                    put_to_hash( platonic_key, Q );
                }
            }

        }
        wiki_data_pagecontent.close();

        // \\ // ! PRINT IT ! // \\ // \\ // \\ // \\ // \\ // \\
        for (Map.Entry<String, HashSet<String> > entry : q_valMap.entrySet()) 
        {
            // print only this entry's own set of Q values
            System.out.println(entry.getKey() + " : " + entry.getValue());
        }



}

// add a Q value to the set stored in the map under the appropriate entity
public static HashSet<String> put_to_hash(String key, String value ) 
{
    HashSet<String> valSet;
    if (q_valMap.containsKey(key)) {
        valSet = q_valMap.get(key);
    } else {
        valSet = new HashSet<String>();
        q_valMap.put(key, valSet);
    } 
    valSet.add(value);
    return valSet;
}


// print the final map of entities to their Q values
public static void print_data() 
{
    System.out.println("THIS IS THE FINAL DATA SET!!!");
    // \\ // ! PRINT IT ! // \\ // \\ // \\ // \\ // \\ // \\
    for (Map.Entry<String, HashSet<String> > entry : q_valMap.entrySet()) 
    {
        System.out.println(entry.getKey() + " : " + entry.getValue());
    }
}
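
As an aside, put_to_hash is the usual get-or-create idiom for a map of sets; on Java 8 or later the same behaviour can be written with computeIfAbsent. A minimal sketch, assuming the same q_valMap field as above:

// Java 8+ version of put_to_hash: create the set on first use, then add the value
public static HashSet<String> put_to_hash( String key, String value )
{
    HashSet<String> valSet = q_valMap.computeIfAbsent( key, k -> new HashSet<String>() );
    valSet.add( value );
    return valSet;
}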

Dealing with disambiguation pages:

public static void all_possibilities( String platonic_key, String associated_alias ) throws Exception
{
    System.out.println("this is a disambig page");
    //since it's a disambig page we can go straight to the ordinary Wikipedia article


    //get its normal Wikipedia disambiguation page
    String URL_czech = "https://en.wikipedia.org/wiki/" + associated_alias;
    URL wikidata_page = new URL(URL_czech);
    HttpURLConnection wiki_connection = (HttpURLConnection)wikidata_page.openConnection();
    InputStream wikiInputStream = null;


    try 
    {
        // try to connect and use the input stream
        wiki_connection.connect();
        wikiInputStream = wiki_connection.getInputStream();
    } 
    catch(IOException e) 
    {
        // failed, try using the error stream
        wikiInputStream = wiki_connection.getErrorStream();
    }
    // parse the input stream using Jsoup
    Document docx = Jsoup.parse(wikiInputStream, null, wikidata_page.getProtocol()+"://"+wikidata_page.getHost()+"/");


    // take (roughly) the first link of each entry in the <ul> lists that follow a
    // paragraph mentioning the alias; this can handle the less structured pages
    Elements linx = docx.select( "p:contains(" + associated_alias + ") ~ ul a:eq(0)" );

    for (Element linq : linx) 
    {
        System.out.println(linq.text());

        String linq_nospace = URLEncoder.encode( linq.text() , "UTF-8");
        Wikidata_Q_Reader.getQ( platonic_key, linq_nospace );

    }




}
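
If the visited-set idea from above is the right fix, I imagine it would be threaded through getQ roughly like this. This is only a sketch: visited_aliases is a new field I am inventing here, and I am assuming associated_alias is a good enough key to identify a page.

// sketch only: remember every alias that has already been fetched
static Set<String> visited_aliases = new HashSet<String>();

public static void getQ( String platonic_key, String associated_alias ) throws Exception
{
    // Set.add returns false if the alias was seen before, so stop instead of looping
    if ( !visited_aliases.add( associated_alias ) )
    {
        return;
    }

    // ... the existing fetching and parsing code stays unchanged ...
}

all_possibilities would then not need its own guard, since the only way back into the crawl is through getQ.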

Source: https://stackoverflow.com/questions/29822960/wikipedia-page-parsing-program-caught-in-endless-graph-cycle
