Article
Benying Zou · Jul 13, 2017 19m read

use punycode in caché

Hi everybody, We have written a convert from Punycode. This is reformed from javascript. Perhaps many non-English systems need to use this feature in domain name resolution. Anyone can use and change the code as needed.

Class Util.Punycode Extends %RegisteredObject
{

Parameter MaxInt = 2147483647;

/// Bootstring parameters
Parameter Base = 36;

Parameter TMin = 1;

Parameter TMax = 26;

Parameter Skew = 38;

Parameter Damp = 700;

Parameter InitialBias = 72;

Parameter InitialN = 128;

Parameter Delimiter = "-";

/// Regular expressions
Parameter RegexPunycode = "^xn--";

/// non-ASCII chars
Parameter RegexNonASCII = "[^\x00-\x7E]";

/// RFC 3490 separators
Parameter RegexSeparators = "[\x2E\u3002\uFF0E\uFF61]";

/// A generic error utility function.<br>
/// @private<br>
/// @param {String} type The error type.<br>
/// @returns {Error} Throws a `RangeError` with the applicable error message.
Method Error(type) [ Private ]
{
  If (type = "overflow")
  {
    Throw $$$ERROR("Overflow: input needs wider integers to process")
    Quit
  }

  If (type = "not-basic")
  {
    Throw $$$ERROR("Illegal input >= 0x80 (not a basic code point")
    Quit
  }

  If (type = "invalid-input")
  {
    Throw $$$ERROR("invalid-input")
    Quit
  }
  Throw $$$ERROR("Unknown error type: "_type)
}

/// A generic `Array#map` utility function.<br>
/// @private<br>
/// @param {Array} array The array to iterate over.<br>
/// @param {Function} callback The function that gets called for every array
/// item.<br>
/// @returns {Array} A new array of values returned by the callback function.
Method Map(list, fnName As %String) As %List [ Private ]
{
  //#Dim result As %ArrayOfDataTypes
  Set output = $LB()
  Set length = $LISTLENGTH(list)
  For i=length:-1:1 
  {
    Set = $LIST(list, i)
    If (fnName = "regexNonASCII")
    {
      Set matcher = ##class(%Regex.Matcher).%New(..#RegexNonASCII, v)
      If (matcher.Locate())
      {
        Set = "xn--"_..Encode(v)
      }
    }
    Elseif (fnName = "regexPunycode")
    {
      Set matcher = ##class(%Regex.Matcher).%New(..#RegexPunycode, v)
      If (matcher.Locate())
      {
        Set = $extract(v, 5, *)
        Set = ..Decode($ZCONVERT(v, "l"))
      }
    }
    Set $LIST(output, i) = v
  }
  Quit output
}

/// A simple `Array#map`-like wrapper to work with domain name strings or email addresses. <br>
/// @private<br>
/// @param {String} domain The domain name or email address.<br>
/// @param {Function} callback The function that gets called for every character.<br>
/// @returns {Array} A new string of characters returned by the callback function.
Method MapDomain(string As %String, fnName As %String) As %String [ Private ]
{
  Set result = ""
  If (string["@") 
  {
    // In email addresses, only the domain name should be punycoded. Leave
    // the local part (i.e. everything up to `@`) intact.
    Set result = $PIECE(string, "@", 1)_"@"
    Set string = $PIECE(string, "@", 2)
  }

  // Avoid `split(regex)` for IE8 compatibility. See #17.
  Set matcher = ##class(%Regex.Matcher).%New(..#RegexSeparators, string)
  Set string = matcher.ReplaceAll($char($ZHEX("2e")))

  Set labels = $LISTFROMSTRING(string, ".")
  Set encoded = $LISTTOSTRING(..Map(labels, fnName), ".")

  Quit result_encoded
}

/// Creates an array containing the numeric code points of each Unicode
/// character in the string. While JavaScript uses UCS-2 internally,
/// this function will convert a pair of surrogate halves (each of which
/// UCS-2 exposes as separate characters) into a single code point,
/// matching UTF-16.<br>
/// @see `Ucs2encode`<br>
/// @param {String} string The Unicode input string (UCS-2).<br>
/// @returns {Array} The new array of code points.
Method Ucs2decode(string As %String) As %ArrayOfDataTypes [ Private ]
{
  Set output = ##class(%ArrayOfDataTypes).%New()
  Set counter = 0
  Set length = $length(string)
  While (counter < length)
  {
    Set counter = counter + 1
    Set value = $ascii($extract(string, counter))
    If ((value >= $zhex("D800")) && (value <= $zhex("DBFF") && (counter < length)))
    {
      Set counter = counter + 1
      Set extra = $ascii($extract(string, counter))
      Set tmp = $zboolean(extra, $zhex("FC00"), 1)
      If (tmp = $zhex("DC00"))
      {
        Set = $zboolean(value, $zhex("3FF"), 1) * (2 ** 10) + $zboolean(extra, $zhex("3FF"), 1) + $zhex("10000")
        Do output.SetAt(a, (output.Count() + 1))
      Else {
        Do output.SetAt(value, (output.Count() + 1)) 
        Set counter = counter - 1
      }
    Else {
      Do output.SetAt(value, (output.Count() + 1)) 
    }
  }
  Quit output
}

/// Creates a string based on an array of numeric code points.<br>
/// @see `Usc2decode`<br>
/// @param {Array} codePoints The array of numeric code points.<br>
/// @returns {String} The new Unicode string (UCS-2).
Method Ucs2encode(array As %ArrayOfDataTypes) As %String [ Private ]
{
  Set output = ""
  Set length = array.Count()
  For i=1:1:length
  {
    Set output = output_$char(array.GetAt(i))
  }
  Quit output
}

/// Converts a basic code point into a digit/integer.
/// @see `DigitToBasic()`<br>
/// @private<br>
/// @param {Number} codePoint The basic numeric code point value.<br>
/// @returns {Number} The numeric value of a basic code point (for use in
/// representing integers) in the range `0` to `base - 1`, or `base` if
/// the code point does not represent a value.
Method BasicToDigit(codePoint) As %Integer [ Private ]
{
  Set res = ..#Base
  If ((codePoint - $zhex("30")) < $zhex("0A"))
  {
    Set res = codePoint - $zhex("16")
  }
  Elseif ((codePoint - $zhex("41")) < $zhex("1A"))
  {
    Set res = codePoint - $zhex("41")
  }
  Elseif ((codePoint - $zhex("61")) < $zhex("1A"))
  {
    Set res = codePoint - $zhex("61")
  }
  Quit res
}

/// Converts a digit/integer into a basic code point.
/// @see `BasicToDigit()`<br>
/// @private<br>
/// @param {Number} digit The numeric value of a basic code point.<br>
/// @returns {Number} The basic code point whose value (when used for
/// representing integers) is `digit`, which needs to be in the range
/// `0` to `base - 1`. If `flag` is non-zero, the uppercase form is
/// used; else, the lowercase form is used. The behavior is undefined
/// if `flag` is non-zero and `digit` has no uppercase form.
Method DigitToBasic(digit, flag) As %Integer [ Private ]
{
  Set result = digit + 22
  If (digit < 26)
  {
    Set result = result + 75
  }
  Set tmp = 0
  If (flag '= 0) 
  {
    Set tmp = 2**5
  }
  Quit result - tmp
}

/// Bias adaptation function as per section 3.4 of RFC 3492.<br>
/// https://tools.ietf.org/html/rfc3492#section-3.4<br>
/// @private
Method Adapt(delta, numPoints, firstTime As %Boolean) As %Integer [ Private ]
{
  Set = 0
  Set delta1 = delta / ..#Damp \ 1
  If ('firstTime)
  {
    Set delta1 = delta / 2 \ 1
  }
  Set delta2 = delta1 / numPoints \ 1
  Set delta1 = delta1 + delta2
  Set baseMinusTMin = ..#Base - ..#TMin

  While (delta1 > ((baseMinusTMin * ..#TMax) / 2 \ 1))
  {
    Set = + ..#Base
    Set delta1 = delta1 / baseMinusTMin \ 1
  }

  Quit (+ ((baseMinusTMin + 1) * delta1 / (delta1 + ..#Skew))) \ 1
}

/// Converts a Punycode string of ASCII-only symbols to a string of Unicode
/// symbols.<br>
/// @param {String} input The Punycode string of ASCII-only symbols.<br>
/// @returns {String} The resulting string of Unicode symbols.
Method Decode(input As %String) As %String [ Private ]
{
  // Don't use UCS-2.
   
  Set output = ##class(%ArrayOfDataTypes).%New()
  Set inputLength = $length(input)
  Set = 0
  Set = ..#InitialN
  Set bias = ..#InitialBias
   
  // Handle the basic code points: let `basic` be the number of input code
  // points before the last delimiter, or `0` if there is none, then copy
  // the first basic code points to the output.
  Set basic = ..LastIndexOf(input, ..#Delimiter) + 1

  For j=1:1:(basic-2)
  {
    Set charcode = $ascii($extract(input, j))
    If (charcode >= $zhex("80"))
    {
      Do ..Error("not-basic")
    }
    Do output.SetAt(charcode, output.Count() + 1)
  }

  // Main decoding loop: start just after the last delimiter if any basic code
  // points were copied; start at the beginning otherwise.
  Set index = 1
  If (basic > 1)
  {
    Set index = basic
  }
  While (index <= inputLength)
  {
    // `index` is the index of the next character to be consumed.
    // Decode a generalized variable-length integer into `delta`,
    // which gets added to `i`. The overflow checking is easier
    // if we increase `i` as we go, then subtract off its starting
    // value at the end to obtain `delta`.

    Set oldi = i
    Set = 1
    Set = ..#Base
    Set con = 1
    While (con > 0)
    {
      If (index > inputLength)
      {
        Do ..Error("invalid-input")
      }

      Set digit = ..BasicToDigit($ascii($extract(input, index)))
      Set index = index + 1

      If ((digit >= ..#Base) || (digit > ((..#MaxInt - i) / \ 1)))
      {
        Do ..Error("overflow 1")
      }

      Set = + (digit * w)

      Set = ..#TMin
      If (> bias)
      {
        If (>= (bias + ..#TMax))
        {
          Set = ..#TMax
        }
        Else
        {
          Set = - bias
        }
      }

      If (digit < t)
      {
        Set con = 0
        quit
      }

      Set baseMinusT = ..#Base - t
      If (> (..#MaxInt / baseMinusT \ 1))
      {
        Do ..Error("overflow 2")
      }

      Set = * baseMinusT
      Set = + ..#Base
    }
    Set out = output.Count() + 1

    Set bias = ..Adapt((- oldi), out, (oldi=0))

    // `i` was supposed to wrap around from `out` to `0`,
    // incrementing `n` each time, so we'll fix that now:
    If ((/ out \ 1) > (..#MaxInt - n))
    {
      Do ..Error("overflow 3")
    }

    Set = + (/ out \ 1)
    Set = - (out * (/ out \ 1))

    Set outcount = output.Count()
    For id=outcount:-1:i+1
    {
      Do output.SetAt(output.GetAt(id), id+1) 
    }
    Do output.SetAt(n, i+1)
    Set = + 1
  }

  Set res = ""
  Set outcount = output.Count()
  For i=1:1:outcount
  {
    Set = $char(output.GetAt(i))
    Set res = res_c
  }
  Quit res
}

/// Converts a string of Unicode symbols (e.g. a domain name label) to a
/// Punycode string of ASCII-only symbols.<br>
/// @param {String} input The string of Unicode symbols.<br>
/// @returns {String} The resulting Punycode string of ASCII-only symbols.
Method Encode(input As %String) As %String [ Private ]
{
  Set output = ##class(%ArrayOfDataTypes).%New()

  // Convert the input in UCS-2 to an array of Unicode code points.
  Set inputCodes = ..Ucs2decode(input)

  // Cache the length.
  Set inputLength = inputCodes.Count()

  // Initialize the state.
  Set = ..#InitialN
  Set delta = 0
  Set bias = ..#InitialBias

  // Handle the basic code points.
  For i=1:1:inputLength
  {
    Set currentValue = inputCodes.GetAt(i)
    If (currentValue < $zhex("80"))
    {
      Do output.SetAt($char(currentValue), output.Count() + 1)
    }
  }

  Set basicLength = output.Count()
  Set handledCPCount = basicLength

  // `handledCPCount` is the number of code points that have been handled;
  // `basicLength` is the number of basic code points. 

  // Finish the basic string with a delimiter unless it's empty.
  If (basicLength > 0)
  {
    Do output.SetAt(..#Delimiter, output.Count() + 1)
  }

  // Main encoding loop
  While (handledCPCount < inputLength)
  {
    // All non-basic code points < n have been handled already. Find the next
    // larger one:
    Set = ..#MaxInt
    For i=1:1:inputLength
    {
      Set currentValue = inputCodes.GetAt(i)

      If ((currentValue >= n) && (currentValue < m))
      {
        Set = currentValue
      }
    }

    // Increase `delta` enough to advance the decoder's <n,i> state to <m,0>,
    // but guard against overflow.
    Set handledCPCountPlusOne = handledCPCount + 1
    If ((m-n) > ((..#MaxInt - delta) / handledCPCountPlusOne \ 1))
    {
    Do ..Error("overflow 4")
    }

    Set delta = delta + ((- n) * handledCPCountPlusOne)
    Set = m

    For i=1:1:inputLength
    {
      Set currentValue = inputCodes.GetAt(i)

      If (currentValue < n)
      {
        Set delta = delta + 1
        If (delta > ..#MaxInt)
        {
          Do ..Error("overflow 5")
        }
      }

      If (currentValue = n)
      {
        // Represent delta as a generalized variable-length integer.
        Set = delta
        Set = ..#Base
        Set con = 1
        While (con > 0)
        {
          Set = ..#TMin
          If (> bias)
          {
            If (>= (bias + ..#TMax))
            {
              Set = ..#TMax
            }
            Else
            {
              Set = - bias
            }
          }

          If (< t)
          {
            Set con = 0
            quit
          }
          Set qMinusT = - t
          Set baseMinusT = ..#Base - t

          Set qMinusTModeBaseMinusT = qMinusT - ((qMinusT / baseMinusT \ 1) * baseMinusT)

          Do output.SetAt($char(..DigitToBasic(+ qMinusTModeBaseMinusT, 0)), output.Count() + 1)

          Set = qMinusT / baseMinusT \ 1

          Set = + ..#Base
        }
        Do output.SetAt($char(..DigitToBasic(q, 0)), output.Count() + 1)
        Set bias = ..Adapt(delta, handledCPCountPlusOne, handledCPCount = basicLength)
        Set delta = 0
        Set handledCPCount = handledCPCount + 1
      }
    }
    Set delta = delta + 1
    Set = + 1
  }
  Set res = ""
  For i=1:1:output.Count()
  {
    Set res = res_output.GetAt(i)
  }
  Quit res
}

/// Find the position of the last occurrence of a substring in a string.<br>
/// @param {String} str Specifies the string to search<br>
/// @param {String} substr Specifies the string to find<br>
/// @returns {Ingeger} the last position
Method LastIndexOf(str As %String, substr As %String) As %Integer [ Private ]
{
  Set pos = 0
  Set tmpstr = str
  Set lengthSub = $length(substr)
  Do {
    Set index = $find(tmpstr, substr)
    If (index > 0)
    {
      Set pos = pos + index - lengthSub
      Set tmpstr = $extract(tmpstr, index, *)
    }
  While (index > 0)

  If (pos < 0)
  {
    Set pos = 0
  }
  Quit pos
}

/// Converts a Punycode string representing a domain name or an email address
/// to Unicode. Only the Punycoded parts of the input will be converted, i.e.
/// it doesn't matter if you call it on a string that has already been
/// converted to Unicode.<br>
/// @param {String} input The Punycoded domain name or email address to
/// convert to Unicode.<br>
/// @returns {String} The Unicode representation of the given Punycode
/// string.
ClassMethod ToUnicode(input As %String = "") As %String
{
  Set ReturnValue=""
  If ($L(input)>0)
  {
    Set Punycode=##class(Util.Punycode).%New()
    Set ReturnValue=Punycode.MapDomain(input, "regexPunycode") 
  
  Quit ReturnValue
}

/// Converts a Unicode string representing a domain name or an email address to
/// Punycode. Only the non-ASCII parts of the domain name will be converted,
/// i.e. it doesn't matter if you call it with a domain that's already in
/// ASCII.<br>
/// @param {String} input The domain name or email address to convert, as a
/// Unicode string.<br>
/// @returns {String} The Punycode representation of the given domain name or
/// email address.
ClassMethod ToASCII(input As %String = "") As %String
{
  Set ReturnValue = ""
  If ($L(input)>0)
  {
    Set Punycode=##class(Util.Punycode).%New()
    Set ReturnValue = Punycode.MapDomain(input, "regexNonASCII")
  }
  Quit ReturnValue
}

}
4
0 719
Discussion (5)6
Log in or sign up to continue

Awesome! Can you put it on Github?

Great thing! Yes, please, upload it on Github?

test in Terminal:

w ##class(Util.Punycode).ToASCII("test@umlautprüfung.de")
​>> test@xn--umlautprfung-klb.de

 

w ##class(Util.Punycode).ToUnicode("test@xn--umlautprfung-klb.de")
​>> test@umlautprüfung.de

Nice. I almost have the same code but mine is not working :/

One aspect of punycode encoding is that Chrome is (since v56) encoding data submitted from input type='email' fields with punycode. Not a big deal, you might think but consider people signing up for you service using chrome, data gets stored as punycode and the user will never be able to login from any other browser because the submitted value will not match the stored value!

 

See https://justmarkup.com/log/2015/02/input-type-email-better-dont-use-it/

 

Fred