parse/modules/bibtex/text.js

/**
 * @module input/bibtex
 */

import TokenStack from '../../../util/stack'

/**
 * Mapping of BibTeX Escaped Chars to Unicode.
 *
 * From [Zotero's reversed mapping table](https://github.com/zotero/translators/blob/master/BibTeX.js#L2353)
 * [REPO](https://github.com/zotero/translators)
 *
 * Accesed 11/09/2016
 *
 * @access private
 * @constant varBibTeXTokens
 * @default
 */
import varBibTeXTokens from './tokens.json'

/**
 * Match any BibTeX token.
 *
 *     new RegExp(
 *       // word commands
 *       '\\\\url|\\\\href|' +
 *       '{\\\\[a-zA-Z]+}|\\$\\\\[a-zA-Z]+\\$|' +
 *       // math
 *       '\\$[_^]{[0-9()+\\-=n]}\\$|'+
 *       // symbols
 *       '`{2,3}|\'{2,3}|-{2,3}|[!?]!|!\\?|{\\\\~}|' +
 *       // escaped symbols
 *       '\\\\[#$%&~_^\\\\{}]|' +
 *       // diacritics
 *       '{\\\\(?:[a-z] |[`"\'^~=.])\\\\?[a-zA-Z]}|' +
 *       // non-breaking space
 *       '[\\s\\S]', 'g')
 *
 * @access private
 * @constant tokenPattern
 * @default
 */
const tokenPattern = /\\url|\\href|{\\[a-zA-Z]+}|\$\\[a-zA-Z]+\$|\$[_^]{[0-9()+=\-n]}\$|`{2,3}|'{2,3}|-{2,3}|[!?]!|!\?|{\\~}|\\[#$%&~_^\\{}]|{\\(?:[a-z] |[`"'^~=.])\\?[a-zA-Z]}|[\s\S]/g

const whitespace = /^\s$/
const syntax = /^[@{}"=,\\]$/
const delimiters = {
  '"': '"',
  '{': '}',
  '': ''
}

/**
 * Tokenize a BibTeX string
 *
 * @access private
 * @method getTokenizedBibtex
 *
 * @param {String} str - Input BibTeX
 *
 * @return {Array<String>} list of tokens
 */
const getTokenizedBibtex = function (str) {
  // Substitute command of form "\X{X}" into "{\X X}"
  str = str
    .replace(/(\\[`"'^~=.]){\\?([A-Za-z])}/g, '{$1$2}')
    .replace(/(\\[a-z]) ?{\\?([A-Za-z])}/g, '{$1 $2}')

  // Tokenize, with escaped characters in mind
  return str.match(tokenPattern)
}

/**
 * Format BibTeX data
 *
 * @access protected
 * @method parseBibTeX
 *
 * @param {String} str - The input data
 *
 * @return {Array<CSL>} The formatted input data
 */
const parseBibTeX = function (str) {
  const entries = []
  const tokens = getTokenizedBibtex(str)
  const stack = new TokenStack(tokens)

  try {
    stack.consumeWhitespace()

    while (stack.tokensLeft()) {
      stack.consumeToken('@', {spaced: false})
      stack.consumeWhitespace()

      const type = stack.consume([whitespace, syntax], {inverse: true}).toLowerCase()

      stack.consumeToken('{')

      const label = stack.consume([whitespace, syntax], {inverse: true})

      stack.consumeToken(',')

      const properties = {}

      while (stack.tokensLeft()) {
        const key = stack.consume([whitespace, '='], {inverse: true}).toLowerCase()

        stack.consumeToken('=')

        const startDelimiter = stack.consume(/^({|"|)$/g)
        const endDelimiter = delimiters[startDelimiter]

        if (!delimiters.hasOwnProperty(startDelimiter)) {
          throw new SyntaxError(`Unexpected field delimiter at index ${stack.index}. Expected ` +
            `${Object.keys(delimiters).map(v => `"${v}"`).join(', ')}; got "${startDelimiter}"`)
        }

        const tokenMap = token => {
          if (varBibTeXTokens.hasOwnProperty(token)) {
            return varBibTeXTokens[token]
          } else if (token.match(/^\\[#$%&~_^\\{}]$/)) {
            return token.slice(1)
          } else if (token.length > 1) {
            throw new SyntaxError(`Escape sequence not recognized: ${token}`)
          } else {
            return token
          }
        }

        let openBrackets = 0
        const val = stack.consume((token, index) => {
          if (token === '{') {
            openBrackets++
          }

          if (stack.tokensLeft() < endDelimiter.length) {
            throw new SyntaxError(`Unmatched delimiter at index ${stack.index}: Expected ${endDelimiter}`)
          } else if (!endDelimiter.length) {
            return ![whitespace, syntax].some(rgx => rgx.test(token))
          } else {
            return (token === '}' && openBrackets--) || !stack.matchesSequence(endDelimiter)
          }
        }, {tokenMap})

        properties[key] = val

        stack.consumeN(endDelimiter.length)
        stack.consumeWhitespace()

        // Last entry (no trailing comma)
        if (stack.matches('}')) { break }

        stack.consumeToken(',', {spaced: false})
        stack.consumeWhitespace()

        // Last entry (trailing comma)
        if (stack.matches('}')) { break }
      }

      stack.consumeToken('}', {spaced: false})
      stack.consumeWhitespace()

      entries.push({type, label, properties})
    }
  } catch (e) {
    logger.error(`Uncaught SyntaxError: ${e.message}. Returning completed entries.`)

    // Remove last, possibly incomplete entry
    entries.pop()
  }

  return entries
}

export {
  parseBibTeX as parse,
  parseBibTeX as default
}