Run the perceptron algorithm on a hash map feature vector: Java

前端 未结 2 1708
轮回少年
轮回少年 2021-01-17 07:52

I have the following code, which reads many files from a directory into a hash map; this is my feature vector. It's somewhat naive in the sense that it does no st

2条回答
  •  庸人自扰
    2021-01-17 08:30

    This is the full and complete answer to my original question, posted here for the benefit of future readers.


    Given the following files:

    • atheism/a_0.txt

      Gott ist tot.
      
    • politics/p_0.txt

      L'Etat, c'est moi , et aussi moi .
      
    • science/s_0.txt

      If I have seen further it is by standing on the shoulders of giants.
      
    • sports/s_1.txt

      You miss 100% of the shots you don't take.
      
    • Output data structures:

      /data/train/politics/p_0.txt, [0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
      /data/train/science/s_0.txt, [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0]
      /data/train/atheism/a_0.txt, [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
      /data/train/sports/s_1.txt, [0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1]
      

    The code looks like this, or you can find it on my GitHub page.

    /**
     * Builds bag-of-words count vectors (the input for a perceptron) from text
     * files stored in one sub-directory per category under {@link #PATH}.
     *
     * <p>The work is done in two passes over the category directories: the first
     * pass collects every distinct token into {@link #GLOBO_DICT}; the second
     * records the tokens of each individual file in {@link #fileDict}. Finally each
     * file is turned into an {@code int[]} whose i-th entry is the number of times
     * the i-th dictionary word occurs in that file.
     *
     * <p>Tokens are produced by splitting each line on a single space character, so
     * punctuation stays attached to words and consecutive spaces yield empty
     * tokens.
     */
    public class FileDictCreateur 
    {
        /** Root directory that contains one sub-directory per category. */
        static String PATH = "/home/matthias/Workbench/SUTD/ISTD_50.570/assignments/practice_data/data/train";

        /** The global set of all distinct words across all articles. */
        static Set<String> GLOBO_DICT = new HashSet<>();

        /** {@code false} while the global dictionary is being filled, {@code true} afterwards. */
        static boolean globo_dict_fixed = false;

        /** For every file, the list of all words it contains (duplicates kept). */
        static Map<File, ArrayList<String>> fileDict = new HashMap<>();

        /** Final structure: for every file, its word-count vector over the global dictionary. */
        static Map<File, int[]> perceptron_input = new HashMap<>();


        /**
         * Runs both passes over the category directories, builds the count vectors
         * and prints one {@code <file>, <vector>} line per file.
         *
         * @param args unused
         * @throws IOException if a category directory cannot be listed or a file cannot be read
         */
        public static void main(String[] args) throws IOException 
        {
            // each of the different categories
            String[] categories = { "/atheism", "/politics", "/science", "/sports"};

            // first pass: populate the global dictionary
            for (String category : categories)
            {
                iterateDirectory(new File(PATH + category), globo_dict_fixed);
            }

            // every category has been seen, so the dictionary is complete
            globo_dict_fixed = true;

            // second pass: populate the per-file word lists
            for (String category : categories)
            {
                iterateDirectory(new File(PATH + category), globo_dict_fixed);
            }

            perceptron_data_struc_generateur(GLOBO_DICT, fileDict, perceptron_input);

            // print the output
            for (Map.Entry<File, int[]> entry : perceptron_input.entrySet())
            {
                System.out.println(entry.getKey() + ", " + Arrays.toString(entry.getValue()));
            }
        }


        /**
         * Recursively visits every regular file below {@code directory} and feeds its
         * tokens either to the global dictionary or to the per-file dictionary.
         *
         * @param directory        directory to walk
         * @param globo_dict_fixed {@code false} to add tokens to {@link #GLOBO_DICT},
         *                         {@code true} to record them in {@link #fileDict}
         * @throws IOException if {@code directory} cannot be listed or a file cannot be read
         */
        private static void iterateDirectory(File directory, boolean globo_dict_fixed) throws IOException 
        {
            // listFiles() returns null (not an empty array) when the path is not a
            // directory or cannot be read; fail with a clear message instead of an NPE.
            File[] children = directory.listFiles();
            if (children == null)
            {
                throw new IOException("Cannot list directory: " + directory);
            }

            for (File file : children)
            {
                if (file.isDirectory())
                {
                    // descend into the child, not into the directory being listed
                    iterateDirectory(file, globo_dict_fixed);
                    continue;
                }

                List<String> lines = new ArrayList<>();
                try (BufferedReader br = new BufferedReader(new FileReader(file)))
                {
                    String line;
                    while ((line = br.readLine()) != null)
                    {
                        lines.add(line);
                    }
                }

                // a file without any line contributes nothing to either dictionary
                if (lines.isEmpty())
                {
                    continue;
                }

                // Gather the tokens of ALL lines before handing them over:
                // create_file_dict only accepts the first batch for a given file,
                // so a per-line call would drop everything after line one.
                List<String> tokens = new ArrayList<>();
                for (String line : lines)
                {
                    tokens.addAll(Arrays.asList(line.split(" ")));
                }
                String[] words = tokens.toArray(new String[0]);

                if (globo_dict_fixed)
                {
                    create_file_dict(file, words);
                }
                else
                {
                    populate_globo_dict(words);
                }
            }
        }

        /**
         * Registers the words of {@code file} in {@link #fileDict}. Does nothing if
         * the file has already been registered.
         *
         * @param file  the file the words were read from
         * @param words all tokens of that file, duplicates included
         * @throws IOException never thrown by this implementation; kept in the signature for callers
         */
        public static void create_file_dict( File file, String[] words ) throws IOException
        {
            if (fileDict.containsKey(file))
            {
                return;
            }
            fileDict.put(file, new ArrayList<>(Arrays.asList(words)));
        }

        /**
         * Adds every token in {@code words} to {@link #GLOBO_DICT}.
         *
         * @param words tokens to add; duplicates are ignored by the set
         * @throws IOException never thrown by this implementation; kept in the signature for callers
         */
        public static void populate_globo_dict( String[] words ) throws IOException
        {
            GLOBO_DICT.addAll(Arrays.asList(words));
        }

        /**
         * Creates, for every file in {@code fileDict}, an array as long as
         * {@code GLOBO_DICT} in which entry {@code i} counts how often the i-th
         * dictionary word (in the set's iteration order) occurs in that file, and
         * stores the array in {@code perceptron_input} under the file.
         *
         * @param GLOBO_DICT       all known words; its iteration order defines the vector columns
         * @param fileDict         words of each file, duplicates included
         * @param perceptron_input receives one count vector per file
         */
        public static void perceptron_data_struc_generateur(Set<String> GLOBO_DICT, 
                                                        Map<File, ? extends List<String>> fileDict,
                                                        Map<File, int[]> perceptron_input)
        {
            // freeze the iteration order so every word gets a stable column
            List<String> vocabulary = new ArrayList<>(GLOBO_DICT);

            // word -> column lookup, so each document word is resolved in O(1)
            Map<String, Integer> columnOf = new HashMap<>();
            for (int column = 0; column < vocabulary.size(); column++)
            {
                columnOf.put(vocabulary.get(column), column);
            }

            for (Map.Entry<File, ? extends List<String>> entry : fileDict.entrySet())
            {
                // a new int[] is already zero-filled
                int[] counts = new int[vocabulary.size()];

                for (String word : entry.getValue())
                {
                    Integer column = columnOf.get(word);
                    // words outside the dictionary are ignored
                    if (column != null)
                    {
                        counts[column]++;
                    }
                }
                perceptron_input.put(entry.getKey(), counts);
            }
        }
    }
    

提交回复
热议问题