//
// HtmlTreeBuilderState.swift
// SwiftSoup
//
// Created by Nabil Chatbi on 24/10/16.
// Copyright © 2016 Nabil Chatbi.. All rights reserved.
//
import Foundation
/// Contract for a tree-builder state that can handle a single token.
protocol HtmlTreeBuilderStateProtocol {
/// Processes token `t` using tree builder `tb`.
///
/// - Parameters:
///   - t: The token to handle.
///   - tb: The tree builder the token is applied to.
/// - Returns: A `Bool` result. In the visible part of the `HtmlTreeBuilderState`
///   implementation, `false` is returned on some branches that report an error via
///   `tb.error` and return without inserting the token; other branches return `true`
///   or the result of handing the token back to the builder.
/// - Throws: Declared as throwing; the `HtmlTreeBuilderState` implementation propagates
///   errors from the `try` calls it makes on the tree builder.
func process(_ t: Token, _ tb: HtmlTreeBuilder)throws->Bool
}
enum HtmlTreeBuilderState: String, HtmlTreeBuilderStateProtocol {
case Initial
case BeforeHtml
case BeforeHead
case InHead
case InHeadNoscript
case AfterHead
case InBody
case Text
case InTable
case InTableText
case InCaption
case InColumnGroup
case InTableBody
case InRow
case InCell
case InSelect
case InSelectInTable
case AfterBody
case InFrameset
case AfterFrameset
case AfterAfterBody
case AfterAfterFrameset
case ForeignContent
/// A string holding the single character U+0000 (NUL).
private static let nullString: String = "\u{0000}"
/// Reports whether this state is the same case as `s`.
///
/// Cases are compared directly rather than through `hashValue`: equal hash values do
/// not guarantee equal values, so a hash comparison could in principle report a match
/// for two different states, and it hashes both operands for no benefit.
///
/// - Parameter s: The state to compare against.
/// - Returns: `true` when `self` and `s` are the same case, otherwise `false`.
public func equals(_ s: HtmlTreeBuilderState) -> Bool {
    return self == s
}
func process(_ t: Token, _ tb: HtmlTreeBuilder)throws->Bool {
switch self {
case .Initial:
if (HtmlTreeBuilderState.isWhitespace(t)) {
return true // ignore whitespace
} else if (t.isComment()) {
try tb.insert(t.asComment())
} else if (t.isDoctype()) {
// todo: parse error check on expected doctypes
// todo: quirk state check on doctype ids
let d: Token.Doctype = t.asDoctype()
let doctype: DocumentType = DocumentType(
tb.settings.normalizeTag(d.getName()), d.getPubSysKey(), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri())
//tb.settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri())
try tb.getDocument().appendChild(doctype)
if (d.isForceQuirks()) {
tb.getDocument().quirksMode(Document.QuirksMode.quirks)
}
tb.transition(.BeforeHtml)
} else {
// todo: check not iframe srcdoc
tb.transition(.BeforeHtml)
return try tb.process(t) // re-process token
}
return true
case .BeforeHtml:
// Default handling for the BeforeHtml state: insert an `html` start tag,
// move the builder to BeforeHead, then hand the same token back to the builder.
// Returns whatever that re-processing returns.
func anythingElse(_ token: Token, _ builder: HtmlTreeBuilder) throws -> Bool {
    try builder.insertStartTag("html")
    builder.transition(.BeforeHead)
    let handled = try builder.process(token)
    return handled
}
if (t.isDoctype()) {
tb.error(self)
return false
} else if (t.isComment()) {
try tb.insert(t.asComment())
} else if (HtmlTreeBuilderState.isWhitespace(t)) {
return true // ignore whitespace
} else if (t.isStartTag() && (t.asStartTag().normalName()?.equals("html"))!) {
try tb.insert(t.asStartTag())
tb.transition(.BeforeHead)
} else if (t.isEndTag() && (StringUtil.inString(t.asEndTag().normalName()!, haystack: "head", "body", "html", "br"))) {
return try anythingElse(t, tb)
} else if (t.isEndTag()) {
tb.error(self)
return false
} else {
return try anythingElse(t, tb)
}
return true
case .BeforeHead:
if (HtmlTreeBuilderState.isWhitespace(t)) {
return true
} else if (t.isComment()) {
try tb.insert(t.asComment())
} else if (t.isDoctype()) {
tb.error(self)
return false
} else if (t.isStartTag() && (t.asStartTag().normalName()?.equals("html"))!) {
return try HtmlTreeBuilderState.InBody.process(t, tb) // does not transition
} else if (t.isStartTag() && (t.asStartTag().normalName()?.equals("head"))!) {
let head: Element = try tb.insert(t.asStartTag())
tb.setHeadElement(head)
tb.transition(.InHead)
} else if (t.isEndTag() && (StringUtil.inString(t.asEndTag().normalName()!, haystack: "head", "body", "html", "br"))) {
try tb.processStartTag("head")
return try tb.process(t)
} else if (t.isEndTag()) {
tb.error(self)
return false
} else {
try tb.processStartTag("head")
return try tb.process(t)
}
return true
case .InHead:
// Default handling for the InHead state: process a `head` end tag, then hand the
// same token back to the builder and return that result.
func anythingElse(_ token: Token, _ builder: TreeBuilder) throws -> Bool {
    try builder.processEndTag("head")
    let handled = try builder.process(token)
    return handled
}
if (HtmlTreeBuilderState.isWhitespace(t)) {
try tb.insert(t.asCharacter())
return true
}
switch (t.type) {
case .Comment:
try tb.insert(t.asComment())
break
case .Doctype:
tb.error(self)
return false
case .StartTag:
let start: Token.StartTag = t.asStartTag()
var name: String = start.normalName()!
if (name.equals("html")) {
return try HtmlTreeBuilderState.InBody.process(t, tb)
} else if (StringUtil.inString(name, haystack: "base", "basefont", "bgsound", "command", "link")) {
let el: Element = try tb.insertEmpty(start)
// jsoup special: update base the first time it is seen
if (name.equals("base") && el.hasAttr("href")) {
try tb.maybeSetBaseUri(el)
}
} else if (name.equals("meta")) {
let meta: Element = try tb.insertEmpty(start)
// todo: charset switches
} else if (name.equals("title")) {
try HtmlTreeBuilderState.handleRcData(start, tb)
} else if (StringUtil.inString(name, haystack: "noframes", "style")) {
try HtmlTreeBuilderState.handleRawtext(start, tb)
} else if (name.equals("noscript")) {
// else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript)
try tb.insert(start)
tb.transition(.InHeadNoscript)
} else if (name.equals("script")) {
// skips some script rules as won't execute them
tb.tokeniser.transition(TokeniserState.ScriptData)
tb.markInsertionMode()
tb.transition(.Text)
try tb.insert(start)
} else if (name.equals("head")) {
tb.error(self)
return false
} else {
return try anythingElse(t, tb)
}
break
case .EndTag:
let end: Token.EndTag = t.asEndTag()
let name = end.normalName()
if (name?.equals("head"))! {
tb.pop()
tb.transition(.AfterHead)
} else if (name != nil && StringUtil.inString(name!, haystack: "body", "html", "br")) {
return try anythingElse(t, tb)
} else {
tb.error(self)
return false
}
break
default:
return try anythingElse(t, tb)
}
return true
case .InHeadNoscript:
// Default handling for the InHeadNoscript state: report an error for this state,
// insert the token's string form as character data, and report the token as handled.
func anythingElse(_ token: Token, _ builder: HtmlTreeBuilder) throws -> Bool {
    builder.error(self)
    let text = Token.Char().data(token.toString())
    try builder.insert(text)
    return true
}
if (t.isDoctype()) {
tb.error(self)
} else if (t.isStartTag() && (t.asStartTag().normalName()?.equals("html"))!) {
return try tb.process(t, .InBody)
} else if (t.isEndTag() && (t.asEndTag().normalName()?.equals("noscript"))!) {
tb.pop()
tb.transition(.InHead)
} else if (HtmlTreeBuilderState.isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.inString(t.asStartTag().normalName()!,
haystack: "basefont", "bgsound", "link", "meta", "noframes", "style"))) {
return try tb.process(t, .InHead)
} else if (t.isEndTag() && (t.asEndTag().normalName()?.equals("br"))!) {
return try anythingElse(t, tb)
} else if ((t.isStartTag() && StringUtil.inString(t.asStartTag().normalName()!, haystack: "head", "noscript")) || t.isEndTag()) {
tb.error(self)
return false
} else {
return try anythingElse(t, tb)
}
return true
case .AfterHead:
// Default handling for the AfterHead state: process a `body` start tag, set the
// builder's frameset-ok flag to true, then hand the same token back to the builder.
// The result may be ignored by callers.
@discardableResult
func anythingElse(_ token: Token, _ builder: HtmlTreeBuilder) throws -> Bool {
    try builder.processStartTag("body")
    builder.framesetOk(true)
    let handled = try builder.process(token)
    return handled
}
if (HtmlTreeBuilderState.isWhitespace(t)) {
try tb.insert(t.asCharacter())
} else if (t.isComment()) {
try tb.insert(t.asComment())
} else if (t.isDoctype()) {
tb.error(self)
} else if (t.isStartTag()) {
let startTag: Token.StartTag = t.asStartTag()
let name: String = startTag.normalName()!
if (name.equals("html")) {
return try tb.process(t, .InBody)
} else if (name.equals("body")) {
try tb.insert(startTag)
tb.framesetOk(false)
tb.transition(.InBody)
} else if (name.equals("frameset")) {
try tb.insert(startTag)
tb.transition(.InFrameset)
} else if (StringUtil.inString(name, haystack: "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) {
tb.error(self)
let head: Element = tb.getHeadElement()!
tb.push(head)
try tb.process(t, .InHead)
tb.removeFromStack(head)
} else if (name.equals("head")) {
tb.error(self)
return false
} else {
try anythingElse(t, tb)
}
} else if (t.isEndTag()) {
if (StringUtil.inString(t.asEndTag().normalName()!, haystack: "body", "html")) {
try anythingElse(t, tb)
} else {
tb.error(self)
return false
}
} else {
try anythingElse(t, tb)
}
return true
case .InBody:
func anyOtherEndTag(_ t: Token, _ tb: HtmlTreeBuilder) -> Bool {
let name: String? = t.asEndTag().normalName()
let stack: Array, unless in svg
} else {
try tb.insert(startTag)
}
} else if (name.equals("isindex")) {
// how much do we care about the early 90s?
tb.error(self)
if (tb.getFormElement() != nil) {
return false
}
tb.tokeniser.acknowledgeSelfClosingFlag()
try tb.processStartTag("form")
if (startTag._attributes.hasKey(key: "action")) {
if let form: Element = tb.getFormElement() {
try form.attr("action", startTag._attributes.get(key: "action"))
}
}
try tb.processStartTag("hr")
try tb.processStartTag("label")
// hope you like english.
let prompt: String = startTag._attributes.hasKey(key: "prompt") ?
startTag._attributes.get(key: "prompt") :
"self is a searchable index. Enter search keywords: "
try tb.process(Token.Char().data(prompt))
// input
let inputAttribs: Attributes = Attributes()
for attr: Attribute in startTag._attributes {
if (!Constants.InBodyStartInputAttribs.contains(attr.getKey())) {
inputAttribs.put(attribute: attr)
}
}
try inputAttribs.put("name", "isindex")
try tb.processStartTag("input", inputAttribs)
try tb.processEndTag("label")
try tb.processStartTag("hr")
try tb.processEndTag("form")
} else if (name.equals("textarea")) {
try tb.insert(startTag)
// todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.)
tb.tokeniser.transition(TokeniserState.Rcdata)
tb.markInsertionMode()
tb.framesetOk(false)
tb.transition(.Text)
} else if (name.equals("xmp")) {
if (try tb.inButtonScope("p")) {
try tb.processEndTag("p")
}
try tb.reconstructFormattingElements()
tb.framesetOk(false)
try HtmlTreeBuilderState.handleRawtext(startTag, tb)
} else if (name.equals("iframe")) {
tb.framesetOk(false)
try HtmlTreeBuilderState.handleRawtext(startTag, tb)
} else if (name.equals("noembed")) {
// also handle noscript if script enabled
try HtmlTreeBuilderState.handleRawtext(startTag, tb)
} else if (name.equals("select")) {
try tb.reconstructFormattingElements()
try tb.insert(startTag)
tb.framesetOk(false)
let state: HtmlTreeBuilderState = tb.state()
if (state.equals(.InTable) || state.equals(.InCaption) || state.equals(.InTableBody) || state.equals(.InRow) || state.equals(.InCell)) {
tb.transition(.InSelectInTable)
} else {
tb.transition(.InSelect)
}
} else if Constants.InBodyStartOptions.contains(name) {
if (tb.currentElement() != nil && tb.currentElement()!.nodeName().equals("option")) {
try tb.processEndTag("option")
}
try tb.reconstructFormattingElements()
try tb.insert(startTag)
} else if Constants.InBodyStartRuby.contains(name) {
if (try tb.inScope("ruby")) {
tb.generateImpliedEndTags()
if (tb.currentElement() != nil && !tb.currentElement()!.nodeName().equals("ruby")) {
tb.error(self)
tb.popStackToBefore("ruby") // i.e. close up to but not include name
}
try tb.insert(startTag)
}
} else if (name.equals("math")) {
try tb.reconstructFormattingElements()
// todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml)
try tb.insert(startTag)
tb.tokeniser.acknowledgeSelfClosingFlag()
} else if (name.equals("svg")) {
try tb.reconstructFormattingElements()
// todo: handle A start tag whose tag name is "svg" (xlink, svg)
try tb.insert(startTag)
tb.tokeniser.acknowledgeSelfClosingFlag()
} else if Constants.InBodyStartDrop.contains(name) {
tb.error(self)
return false
} else {
try tb.reconstructFormattingElements()
try tb.insert(startTag)
}
} else {
try tb.reconstructFormattingElements()
try tb.insert(startTag)
}
break
case .EndTag:
let endTag: Token.EndTag = t.asEndTag()
if let name = endTag.normalName() {
if Constants.InBodyEndAdoptionFormatters.contains(name) {
// Adoption Agency Algorithm.
for i in 0..<8 {
let formatEl: Element? = tb.getActiveFormattingElement(name)
if (formatEl == nil) {
return anyOtherEndTag(t, tb)
} else if (!tb.onStack(formatEl!)) {
tb.error(self)
tb.removeFromActiveFormattingElements(formatEl!)
return true
} else if (try !tb.inScope(formatEl!.nodeName())) {
tb.error(self)
return false
} else if (tb.currentElement() != formatEl!) {
tb.error(self)
}
var furthestBlock: Element? = nil
var commonAncestor: Element? = nil
var seenFormattingElement: Bool = false
let stack: Array