PHP's str_word_count in JavaScript

How to use

You can install via yarn add locutus and require this function via const str_word_count = require('locutus/php/strings/str_word_count').

It is important to use a bundler that supports tree-shaking so that you only ship the functions that you actually use to your browser, instead of all of Locutus, which is massive. Examples are: Parcel, webpack, or rollup.js. For server-side use this is typically less of a concern.

Examples

Please note that these examples are distilled from test cases that automatically verify our functions still work correctly. This could explain some quirky ones.

#	code	expected result
1	`str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1)`	`['Hello', 'fri', 'nd', "you're", 'looking', 'good', 'today']`
2	`str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 2)`	`{0: 'Hello', 6: 'fri', 10: 'nd', 14: "you're", 29: 'looking', 46: 'good', 51: 'today'}`
3	`str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1, '\u00e0\u00e1\u00e3\u00e73')`	`['Hello', 'fri3nd', "you're", 'looking', 'good', 'today']`
4	`str_word_count('hey', 2)`	`{0: 'hey'}`

Here’s what our current JavaScript equivalent to PHP's str_word_count looks like.

module.exports = function str_word_count(str, format, charlist) {
  //  discuss at: https://locutus.io/php/str_word_count/
  // original by: Ole Vrijenhoek
  // bugfixed by: Kevin van Zonneveld (https://kvz.io)
  // bugfixed by: Brett Zamir (https://brett-zamir.me)
  // bugfixed by: Brett Zamir (https://brett-zamir.me)
  //    input by: Bug?
  // improved by: Brett Zamir (https://brett-zamir.me)
  //   example 1: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1)
  //   returns 1: ['Hello', 'fri', 'nd', "you're", 'looking', 'good', 'today']
  //   example 2: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 2)
  //   returns 2: {0: 'Hello', 6: 'fri', 10: 'nd', 14: "you're", 29: 'looking', 46: 'good', 51: 'today'}
  //   example 3: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1, '\u00e0\u00e1\u00e3\u00e73')
  //   returns 3: ['Hello', 'fri3nd', "you're", 'looking', 'good', 'today']
  //   example 4: str_word_count('hey', 2)
  //   returns 4: {0: 'hey'}

  const ctypeAlpha = require('../ctype/ctype_alpha')
  const len = str.length
  const cl = charlist && charlist.length
  let chr = ''
  let tmpStr = ''
  let i = 0
  let c = ''
  const wArr = []
  let wC = 0
  const assoc = {}
  let aC = 0
  let reg = ''
  let match = false

  const _pregQuote = function (str) {
    return (str + '').replace(/([\\.+*?[^\]$(){}=!<>|:])/g, '\\$1')
  }
  const _getWholeChar = function (str, i) {
    // Use for rare cases of non-BMP characters
    const code = str.charCodeAt(i)
    if (code < 0xd800 || code > 0xdfff) {
      return str.charAt(i)
    }
    if (code >= 0xd800 && code <= 0xdbff) {
      // High surrogate (could change last hex to 0xDB7F to treat high private surrogates as single
      // characters)
      if (str.length <= i + 1) {
        throw new Error('High surrogate without following low surrogate')
      }
      const next = str.charCodeAt(i + 1)
      if (next < 0xdc00 || next > 0xdfff) {
        throw new Error('High surrogate without following low surrogate')
      }
      return str.charAt(i) + str.charAt(i + 1)
    }
    // Low surrogate (0xDC00 <= code && code <= 0xDFFF)
    if (i === 0) {
      throw new Error('Low surrogate without preceding high surrogate')
    }
    const prev = str.charCodeAt(i - 1)
    if (prev < 0xd800 || prev > 0xdbff) {
      // (could change last hex to 0xDB7F to treat high private surrogates as single characters)
      throw new Error('Low surrogate without preceding high surrogate')
    }
    // We can pass over low surrogates now as the second component in a pair which we have already
    // processed
    return false
  }

  if (cl) {
    reg = '^(' + _pregQuote(_getWholeChar(charlist, 0))
    for (i = 1; i < cl; i++) {
      if ((chr = _getWholeChar(charlist, i)) === false) {
        continue
      }
      reg += '|' + _pregQuote(chr)
    }
    reg += ')$'
    reg = new RegExp(reg)
  }

  for (i = 0; i < len; i++) {
    if ((c = _getWholeChar(str, i)) === false) {
      continue
    }
    // No hyphen at beginning or end unless allowed in charlist (or locale)
    // No apostrophe at beginning unless allowed in charlist (or locale)
    // @todo: Make this more readable
    match =
      ctypeAlpha(c) ||
      (reg && c.search(reg) !== -1) ||
      (i !== 0 && i !== len - 1 && c === '-') ||
      (i !== 0 && c === "'")
    if (match) {
      if (tmpStr === '' && format === 2) {
        aC = i
      }
      tmpStr = tmpStr + c
    }
    if (i === len - 1 || (!match && tmpStr !== '')) {
      if (format !== 2) {
        wArr[wArr.length] = tmpStr
      } else {
        assoc[aC] = tmpStr
      }
      tmpStr = ''
      wC++
    }
  }

  if (!format) {
    return wC
  } else if (format === 1) {
    return wArr
  } else if (format === 2) {
    return assoc
  }

  throw new Error('You have supplied an incorrect format')
}

A community effort

Not unlike Wikipedia, Locutus is an ongoing community effort. Our philosophy follows The McDonald’s Theory. This means that we assimilate first iterations with imperfections, hoping for others to take issue with—and improve—them. This unorthodox approach has worked very well to foster fun and fruitful collaboration, but please be reminded to use our creations at your own risk. THE SOFTWARE IS PROVIDED "AS IS" has never been more true than for Locutus.

Now go and: [ View on GitHub | Edit on GitHub | View Raw ]

« More PHP strings functions

Star