//
//  HtmlTreeBuilder.swift
//  SwiftSoup
//
//  Created by Nabil Chatbi on 24/10/16.
//  Copyright © 2016 Nabil Chatbi.. All rights reserved.
//

import Foundation

/**
 * HTML Tree Builder; creates a DOM from Tokens.
 */
class HtmlTreeBuilder: TreeBuilder {
    // tag searches
    public static let TagsSearchInScope: [String] = ["applet", "caption", "html", "table", "td", "th", "marquee", "object"]
    private static let TagSearchList: [String] = ["ol", "ul"]
    private static let TagSearchButton: [String] = ["button"]
    private static let TagSearchTableScope: [String] = ["html", "table"]
    private static let TagSearchSelectScope: [String] = ["optgroup", "option"]
    private static let TagSearchEndTags: [String] = ["dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"]
    private static let TagSearchSpecial: [String] = ["address", "applet", "area", "article", "aside", "base", "basefont", "bgsound",
                                                     "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd",
                                                     "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form",
                                                     "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
                                                     "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav",
                                                     "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
                                                     "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
                                                     "title", "tr", "ul", "wbr", "xmp"]

    private var _state: HtmlTreeBuilderState = HtmlTreeBuilderState.Initial // the current state
    private var _originalState: HtmlTreeBuilderState = HtmlTreeBuilderState.Initial // original / marked state

    private var baseUriSetFromDoc: Bool = false
    private var headElement: Element? // the current head element
    private var formElement: FormElement? // the current form element
    private var contextElement: Element? // fragment parse context -- could be null even if fragment parsing
    // NOTE(review): the generic arguments of the next two arrays were missing in the text under review
    // (`Array = Array()`), which cannot compile. `Element?` and `String` are assumed -- confirm against
    // the code that reads and writes these lists.
    private var formattingElements: [Element?] = [] // active (open) formatting elements
    private var pendingTableCharacters: [String] = [] // chars in table to be shifted out
    private var emptyEnd: Token.EndTag = Token.EndTag() // reused empty end tag

    private var _framesetOk: Bool = true // if ok to go into frameset
    private var fosterInserts: Bool = false // if next inserts should be fostered
    private var fragmentParsing: Bool = false // if parsing a fragment of html

    /**
     * Creates a tree builder; all work is delegated to the superclass initialiser.
     */
    public override init() {
        super.init()
    }

    /**
     * The parse settings used when the caller supplies none.
     * - returns: `ParseSettings.htmlDefault`
     */
    public override func defaultSettings() -> ParseSettings {
        return ParseSettings.htmlDefault
    }

    /**
     * Parses a complete document.
     *
     * Resets the insertion state to `Initial` and clears the "base URI already taken from the
     * document" flag before handing over to the superclass parse.
     * - throws: whatever the superclass parse throws.
     */
    override func parse(_ input: String, _ baseUri: String, _ errors: ParseErrorList, _ settings: ParseSettings) throws -> Document {
        _state = HtmlTreeBuilderState.Initial
        baseUriSetFromDoc = false
        return try super.parse(input, baseUri, errors, settings)
    }

    /**
     * Parses an HTML fragment, optionally in the context of an existing element.
     *
     * When `context` is non-nil, the tokeniser's starting state is chosen from the context's tag name,
     * a synthetic `html` root element is appended to the document and pushed on the stack, and the
     * current form element is set to the nearest `FormElement` among the context and its parents.
     *
     * - parameter inputFragment: the markup to parse
     * - parameter context: the element the fragment is parsed within; may be nil
     * - parameter baseUri: base URI handed to `initialiseParse` and to the synthetic root
     * - parameter errors: error list handed to `initialiseParse`
     * - parameter settings: parse settings handed to `initialiseParse` and used to create the root tag
     * - returns: the synthetic root's child nodes when a context was given, otherwise the document's
     *            child nodes.
     *            NOTE(review): the return type's generic argument was missing in the text under review;
     *            `[Node]` is assumed to be what `getChildNodes()` returns -- confirm.
     * - throws: errors raised while building the root element or while running the parser.
     */
    func parseFragment(_ inputFragment: String, _ context: Element?, _ baseUri: String, _ errors: ParseErrorList, _ settings: ParseSettings) throws -> [Node] {
        // context may be null
        _state = HtmlTreeBuilderState.Initial
        initialiseParse(inputFragment, baseUri, errors, settings)
        contextElement = context
        fragmentParsing = true
        var root: Element?

        if let context = context {
            if let ownerDoc = context.ownerDocument() { // quirks setup:
                doc.quirksMode(ownerDoc.quirksMode())
            }

            // initialise the tokeniser state:
            let contextTag: String = context.tagName()
            if StringUtil.inString(contextTag, haystack: "title", "textarea") {
                tokeniser.transition(TokeniserState.Rcdata)
            } else if StringUtil.inString(contextTag, haystack: "iframe", "noembed", "noframes", "style", "xmp") {
                tokeniser.transition(TokeniserState.Rawtext)
            } else if contextTag == "script" {
                tokeniser.transition(TokeniserState.ScriptData)
            } else if contextTag == "noscript" {
                tokeniser.transition(TokeniserState.Data) // if scripting enabled, rawtext
            } else if contextTag == "plaintext" {
                tokeniser.transition(TokeniserState.Data)
            } else {
                tokeniser.transition(TokeniserState.Data) // default
            }

            // Build the synthetic root as a non-optional local so that no force unwrap is needed.
            let fragmentRoot: Element = try Element(Tag.valueOf("html", settings), baseUri)
            root = fragmentRoot
            try Validate.notNull(obj: fragmentRoot)
            try doc.appendChild(fragmentRoot)
            stack.append(fragmentRoot)
            resetInsertionMode()

            // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated
            // with form correctly
            let contextChain: Elements = context.parents()
            contextChain.add(0, context)
            for case let form as FormElement in contextChain.array() {
                formElement = form
                break
            }
        }

        try runParser()

        // `root` is only ever assigned inside the `context != nil` branch above.
        if let root = root {
            return root.getChildNodes()
        }
        return doc.getChildNodes()
    }

    /**
     * Records `token` as the current token and processes it in the builder's current state.
     * - returns: the result of the state's `process`.
     */
    @discardableResult
    public override func process(_ token: Token) throws -> Bool {
        currentToken = token
        return try self._state.process(token, self)
    }

    /**
     * Records `token` as the current token and processes it in the supplied `state`,
     * without changing the builder's own state.
     * - returns: the result of the state's `process`.
     */
    @discardableResult
    func process(_ token: Token, _ state: HtmlTreeBuilderState) throws -> Bool {
        currentToken = token
        return try state.process(token, self)
    }

    /// Switches the builder to `state`.
    func transition(_ state: HtmlTreeBuilderState) {
        self._state = state
    }

    /// The builder's current state.
    func state() -> HtmlTreeBuilderState {
        return _state
    }

    /// Remembers the current state so it can later be read back via `originalState()`.
    func markInsertionMode() {
        _originalState = _state
    }

    /// The state captured by the most recent `markInsertionMode()` (`Initial` if never marked).
    func originalState() -> HtmlTreeBuilderState {
        return _originalState
    }

    /// Sets whether it is ok to go into frameset.
    func framesetOk(_ framesetOk: Bool) {
        self._framesetOk = framesetOk
    }

    /// Whether it is ok to go into frameset.
    func framesetOk() -> Bool {
        return _framesetOk
    }

    /// The document being built.
    func getDocument() -> Document {
        return doc
    }

    /// The base URI currently in effect.
    func getBaseUri() -> String {
        return baseUri
    }

    /**
     * Adopts the absolute `href` of `base` as the base URI, at most once per parse.
     *
     * Does nothing if a base URI has already been taken from the document, or if the resolved
     * `href` is empty.
     * - throws: errors from resolving the `href` or from updating the document's base URI.
     */
    func maybeSetBaseUri(_ base: Element) throws {
        // only the first element that yields a non-empty href is honoured
        guard !baseUriSetFromDoc else {
            return
        }

        let href: String = try base.absUrl("href")
        guard !href.isEmpty else { // ignore elements without a usable href
            return
        }

        baseUri = href
        baseUriSetFromDoc = true
        try doc.setBaseUri(href) // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
    }

    /// Whether the builder was started through `parseFragment`.
    func isFragmentParsing() -> Bool {
        return fragmentParsing
    }

    /**
     * Records an "unexpected token" parse error for the current token at the reader's position.
     *
     * Nothing is recorded when the error list cannot take more errors or when there is no current token.
     * - parameter state: the state in which the unexpected token was seen; its raw value goes into the message.
     */
    func error(_ state: HtmlTreeBuilderState) {
        guard errors.canAddError(), let token = currentToken else {
            return
        }
        errors.add(ParseError(reader.getPos(), "Unexpected token [\(token.tokenType())] when in state [\(state.rawValue)]"))
    }

    @discardableResult
    func insert(_ startTag: Token.StartTag)throws->Element {
        // handle empty unknown tags
        // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag.
        if (startTag.isSelfClosing()) {
            let el: Element = try insertEmpty(startTag)
            stack.append(el)
            tokeniser.transition(TokeniserState.Data) // handles