11.8 C
London
Friday, February 9, 2024

json – How do I decode HTML entities in Swift?


@akashivskyy’s answer is great and demonstrates how to use NSAttributedString to decode HTML entities. One possible disadvantage
(as he stated) is that all HTML markup is removed as well, so

<strong> 4 &lt; 5 &amp; 3 &gt; 2</strong>

becomes

4 < 5 & 3 > 2

On OS X there’s CFXMLCreateStringByUnescapingEntities() which does the job:

// macOS only: Core Foundation can unescape XML/HTML entities directly.
// Passing `nil` as the entities dictionary is what the original snippet did;
// NOTE(review): presumably this selects the default entity set — confirm in the CF docs.
let encoded = "<strong> 4 &lt; 5 &amp; 3 &gt; 2 .</strong> Worth: 12 &#x20ac;.  &#64; "
// A Swift `String` must be bridged explicitly to `CFString` for the call.
let decoded = CFXMLCreateStringByUnescapingEntities(nil, encoded as CFString, nil) as String
print(decoded)
// <strong> 4 < 5 & 3 > 2 .</strong> Worth: 12 €.  @ 

but this is not available on iOS.

Here is a pure Swift implementation. It decodes character entity
references like &lt; using a dictionary, and all numeric character
entities like &#64; or &#x20ac;. (Note that I did not list all
252 HTML entities explicitly.)

Swift 4:

// Mapping from XML/HTML character entity reference to character.
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
//
// Each key is the complete reference, including the leading "&" and the
// trailing ";". The table is intentionally abbreviated (see the "// ..."
// placeholder); add further named entities as needed.
private let characterEntities : [ Substring : Character ] = [
    // XML predefined entities:
    "&quot;"    : "\"",
    "&amp;"     : "&",
    "&apos;"    : "'",
    "&lt;"      : "<",
    "&gt;"      : ">",

    // HTML character entity references:
    "&nbsp;"    : "\u{00a0}",
    // ...
    "&diams;"   : "♦",
]

extension String {

    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    ///
    /// Named references are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;`) numeric references are
    /// converted to the Unicode scalar with that value. Anything between
    /// an "&" and the next ";" that cannot be decoded is copied verbatim,
    /// and an "&" with no following ";" ends the scan, leaving the rest of
    /// the string unchanged.
    var stringByDecodingHTMLEntities : String {

        // ===== Utility functions =====

        // Convert the number in the string to the corresponding
        // Unicode character, e.g.
        //    decodeNumeric("64", 10)   --> "@"
        //    decodeNumeric("20ac", 16) --> "€"
        // Returns `nil` if the text is not a number in the given base or
        // the number is not a valid Unicode scalar value.
        func decodeNumeric(_ string : Substring, base : Int) -> Character? {
            guard let code = UInt32(string, radix: base),
                let uniScalar = UnicodeScalar(code) else { return nil }
            return Character(uniScalar)
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(_ entity : Substring) -> Character? {

            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                // Strip the "&#x" prefix and the trailing ";".
                return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
            } else if entity.hasPrefix("&#") {
                // Strip the "&#" prefix and the trailing ";".
                return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
            } else {
                return characterEntities[entity]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampRange = self[position...].range(of: "&") {
            result.append(contentsOf: self[position ..< ampRange.lowerBound])
            position = ampRange.lowerBound

            // Find the next ';' and copy everything from '&' to ';' into `entity`
            guard let semiRange = self[position...].range(of: ";") else {
                // No matching ';'.
                break
            }
            let entity = self[position ..< semiRange.upperBound]
            position = semiRange.upperBound

            if let decoded = decode(entity) {
                // Replace by decoded character:
                result.append(decoded)
            } else {
                // Invalid entity, copy verbatim:
                result.append(contentsOf: entity)
            }
        }
        // Copy remaining characters to `result`:
        result.append(contentsOf: self[position...])
        return result
    }
}

Example:

// Markup is left untouched; only the character entity references are decoded.
let encoded = "<strong> 4 &lt; 5 &amp; 3 &gt; 2 .</strong> Worth: 12 &#x20ac;.  &#64; "
let decoded = encoded.stringByDecodingHTMLEntities
print(decoded)
// <strong> 4 < 5 & 3 > 2 .</strong> Worth: 12 €.  @

Swift 3:

// Mapping from XML/HTML character entity reference to character.
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
//
// Each key is the complete reference, including the leading "&" and the
// trailing ";". The table is intentionally abbreviated (see the "// ..."
// placeholder); add further named entities as needed.
private let characterEntities : [ String : Character ] = [
    // XML predefined entities:
    "&quot;"    : "\"",
    "&amp;"     : "&",
    "&apos;"    : "'",
    "&lt;"      : "<",
    "&gt;"      : ">",

    // HTML character entity references:
    "&nbsp;"    : "\u{00a0}",
    // ...
    "&diams;"   : "♦",
]

extension String {

    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    ///
    /// Named references are looked up in `characterEntities`; decimal
    /// (`&#64;`) and hexadecimal (`&#x20ac;`) numeric references are
    /// converted to the Unicode scalar with that value. Anything between
    /// an "&" and the next ";" that cannot be decoded is copied verbatim,
    /// and an "&" with no following ";" ends the scan, leaving the rest of
    /// the string unchanged.
    var stringByDecodingHTMLEntities : String {

        // ===== Utility functions =====

        // Convert the number in the string to the corresponding
        // Unicode character, e.g.
        //    decodeNumeric("64", 10)   --> "@"
        //    decodeNumeric("20ac", 16) --> "€"
        // Returns `nil` if the text is not a number in the given base or
        // the number is not a valid Unicode scalar value.
        func decodeNumeric(_ string : String, base : Int) -> Character? {
            guard let code = UInt32(string, radix: base),
                let uniScalar = UnicodeScalar(code) else { return nil }
            return Character(uniScalar)
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(_ entity : String) -> Character? {

            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X"){
                // Take the digits between the "&#x" prefix and the trailing ";".
                return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 3) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 16)
            } else if entity.hasPrefix("&#") {
                // Take the digits between the "&#" prefix and the trailing ";".
                return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 2) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 10)
            } else {
                return characterEntities[entity]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampRange = self.range(of: "&", range: position ..< endIndex) {
            result.append(self[position ..< ampRange.lowerBound])
            position = ampRange.lowerBound

            // Find the next ';' and copy everything from '&' to ';' into `entity`
            if let semiRange = self.range(of: ";", range: position ..< endIndex) {
                let entity = self[position ..< semiRange.upperBound]
                position = semiRange.upperBound

                if let decoded = decode(entity) {
                    // Replace by decoded character:
                    result.append(decoded)
                } else {
                    // Invalid entity, copy verbatim:
                    result.append(entity)
                }
            } else {
                // No matching ';'.
                break
            }
        }
        // Copy remaining characters to `result`:
        result.append(self[position ..< endIndex])
        return result
    }
}

Swift 2:

// Mapping from XML/HTML character entity reference to character.
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
//
// Each key is the complete reference, including the leading "&" and the
// trailing ";". The table is intentionally abbreviated (see the "// ..."
// placeholder); add further named entities as needed.
private let characterEntities : [ String : Character ] = [
    // XML predefined entities:
    "&quot;"    : "\"",
    "&amp;"     : "&",
    "&apos;"    : "'",
    "&lt;"      : "<",
    "&gt;"      : ">",

    // HTML character entity references:
    "&nbsp;"    : "\u{00a0}",
    // ...
    "&diams;"   : "♦",
]

extension String {

    /// Returns a new string made by replacing in the `String`
    /// all HTML character entity references with the corresponding
    /// character.
    ///
    /// Named references are looked up in `characterEntities`; numeric
    /// references (`&#64;`, `&#x20ac;`) are parsed with `strtoul`.
    /// A named entity that is not in the table is copied verbatim, and an
    /// "&" with no following ";" ends the scan, leaving the rest of the
    /// string unchanged.
    var stringByDecodingHTMLEntities : String {

        // ===== Utility functions =====

        // Convert the number in the string to the corresponding
        // Unicode character, e.g.
        //    decodeNumeric("64", 10)   --> "@"
        //    decodeNumeric("20ac", 16) --> "€"
        // NOTE(review): `strtoul` stops at the first character that is not a
        // digit in `base`, which is why callers may pass the trailing ";".
        // It also yields 0 for unparsable text, so a malformed numeric entity
        // appears to decode to U+0000 instead of `nil` — confirm.
        func decodeNumeric(string : String, base : Int32) -> Character? {
            let code = UInt32(strtoul(string, nil, base))
            return Character(UnicodeScalar(code))
        }

        // Decode the HTML character entity to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(entity : String) -> Character? {

            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X"){
                // Skip the "&#x" prefix; the trailing ";" is passed along.
                return decodeNumeric(entity.substringFromIndex(entity.startIndex.advancedBy(3)), base: 16)
            } else if entity.hasPrefix("&#") {
                // Skip the "&#" prefix; the trailing ";" is passed along.
                return decodeNumeric(entity.substringFromIndex(entity.startIndex.advancedBy(2)), base: 10)
            } else {
                return characterEntities[entity]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampRange = self.rangeOfString("&", range: position ..< endIndex) {
            result.appendContentsOf(self[position ..< ampRange.startIndex])
            position = ampRange.startIndex

            // Find the next ';' and copy everything from '&' to ';' into `entity`
            if let semiRange = self.rangeOfString(";", range: position ..< endIndex) {
                let entity = self[position ..< semiRange.endIndex]
                position = semiRange.endIndex

                if let decoded = decode(entity) {
                    // Replace by decoded character:
                    result.append(decoded)
                } else {
                    // Invalid entity, copy verbatim:
                    result.appendContentsOf(entity)
                }
            } else {
                // No matching ';'.
                break
            }
        }
        // Copy remaining characters to `result`:
        result.appendContentsOf(self[position ..< endIndex])
        return result
    }
}

Latest news
Related news

LEAVE A REPLY

Please enter your comment!
Please enter your name here