How to retrieve Wiktionary word content?

前端 未结 9 1630
花落未央
花落未央 2020-11-28 18:27

How may Wiktionary's API be used to determine whether or not a word exists?

相关标签:
9条回答
  • 2020-11-28 18:59

    You can download a dump of Wiktionary data. There's more information in the FAQ. For your purposes, the definitions dump is probably a better choice than the XML dump.

    0 讨论(0)
  • 2020-11-28 18:59

    To keep it really simple, extract the words from the dump like this:

    # Decompress the dump to stdout, keep only lines containing a <title>
    # element whose text has no whitespace or punctuation characters, strip
    # everything except the title text, and write the result to the file `words`.
    bzcat pages-articles.xml.bz2 | grep '<title>[^[:space:][:punct:]]*</title>' | sed 's:.*<title>\(.*\)</title>.*:\1:' > words
    
    0 讨论(0)
  • 2020-11-28 18:59

    Here's a start to parsing etymology and pronunciation data:

    /**
     * Extracts one IPA transcription and an accent label from a single line of
     * Wiktionary pronunciation wikitext.
     *
     * Rules are tried in order and the LAST rule that matches wins, so a GA
     * match overrides an earlier enPR match on the same line.
     *
     * Examples of lines this handles:
     *   {{a|GA}} {{IPA|/ˈhæpi/|lang=en}}
     *   * {{a|GA}} {{enPR|plēz}}, {{IPA|/pliz/|[pʰliz]|lang=en}}
     * An accent tag other than UK, US or GA directly followed by the IPA
     * template (e.g. `* {{a|RP}} {{IPA|/pliːz/|lang=en}}`) matches no rule.
     *
     * @param {string} line - One line of wikitext.
     * @returns {{val: string, type: string}|undefined} `val` is the first
     *   transcription of the matched IPA template with its slashes removed,
     *   `type` is 'uk', 'us' or 'ga'. Returns undefined when no rule matches.
     */
    function parsePronunciationLine(line) {
      // Shared tail: `{{IPA|/first/|...more transcriptions...|lang=en}}`.
      // Only the first transcription is captured; any further positional
      // arguments before `|lang=en` are skipped.
      const ipaTemplate = String.raw`\{\{IPA\|\/?([^\/|]+)\/?(?:\|[^|}]*)*\|lang=en\}\}`

      // Every `|` separator is escaped. Left unescaped it is regex alternation,
      // which let a bare `{{a` count as a match without a capture and discard a
      // result found by an earlier rule.
      const rules = [
        ['uk', String.raw`\{\{\s*a\s*\|UK\s*\}\}\s*`],
        ['us', String.raw`\{\{\s*a\s*\|US\s*\}\}\s*`],
        // A line carrying an enPR template is labelled 'us'.
        ['us', String.raw`\{\{enPR\|[^}]+\}\},?\s*`],
        ['ga', String.raw`\{\{a\|GA\}\},?\s*`],
        // GA tag with other templates between it and the IPA template.
        ['ga', String.raw`\{\{a\|GA\}\},?.+`],
      ]

      let result
      for (const [type, prefix] of rules) {
        const match = new RegExp(`${prefix}${ipaTemplate}`).exec(line)
        if (match) {
          result = { val: match[1], type }
        }
      }
      return result
    }
    
    /**
     * Splits one pipe-delimited etymology template body into a
     * [language, term] pair.
     *
     * The first field (the template name) is dropped. Up to two leading fields
     * that are truthy keys of the outer `langs` lookup are consumed, and the
     * last one consumed becomes the language. The next remaining field is the
     * term. No brace stripping is done here.
     *
     * NOTE(review): `langs` is defined outside this block; presumably it is
     * keyed by language code. Confirm against its definition.
     *
     * Sample templates this is aimed at:
     *   {{inh|en|enm|poisoun}}
     *   {{m|enm|poyson}}
     *   {{der|en|la|pōtio|pōtio, pōtiōnis|t=drink, a draught, a poisonous draught, a potion}}
     *   {{m|la|pōtō|t=I drink}}
     *   {{der|en|enm|happy||fortunate, happy}}
     *   {{cog|is|heppinn||lucky}}
     *
     * @param {string} piece - Pipe-delimited template text.
     * @returns {Array} `[language, term]`; either entry is undefined when the
     *   corresponding field is absent.
     */
    function parseEtymologyPiece(piece) {
      // Everything after the template name.
      const fields = piece.split('|').slice(1)

      // Two passes: each consumes the head field if it is a known language,
      // so the language kept is the last of at most two leading codes.
      let language
      for (let pass = 0; pass < 2; pass += 1) {
        if (langs[fields[0]]) {
          language = fields.shift()
        }
      }

      const term = fields.shift()
      return [language, term]
    }
    

    Update: Here is a gist with it more fleshed out.

    0 讨论(0)
提交回复
热议问题