";
+ };
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+ 629A63C81D9AFE0E0089679F /* HTMLKitExample */ = {
+ isa = PBXNativeTarget;
+ buildConfigurationList = 629A63D01D9AFE0E0089679F /* Build configuration list for PBXNativeTarget "HTMLKitExample" */;
+ buildPhases = (
+ 629A63C51D9AFE0E0089679F /* Sources */,
+ 629A63C61D9AFE0E0089679F /* Frameworks */,
+ 629A63C71D9AFE0E0089679F /* CopyFiles */,
+ );
+ buildRules = (
+ );
+ dependencies = (
+ );
+ name = HTMLKitExample;
+ productName = HTMLKitExample;
+ productReference = 629A63C91D9AFE0E0089679F /* HTMLKitExample */;
+ productType = "com.apple.product-type.tool";
+ };
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+ 629A63C11D9AFE0E0089679F /* Project object */ = {
+ isa = PBXProject;
+ attributes = {
+ LastSwiftUpdateCheck = 0800;
+ LastUpgradeCheck = 0800;
+ ORGANIZATIONNAME = iabudiab;
+ TargetAttributes = {
+ 629A63C81D9AFE0E0089679F = {
+ CreatedOnToolsVersion = 8.0;
+ ProvisioningStyle = Automatic;
+ };
+ };
+ };
+ buildConfigurationList = 629A63C41D9AFE0E0089679F /* Build configuration list for PBXProject "HTMLKitExample" */;
+ compatibilityVersion = "Xcode 3.2";
+ developmentRegion = English;
+ hasScannedForEncodings = 0;
+ knownRegions = (
+ en,
+ );
+ mainGroup = 629A63C01D9AFE0E0089679F;
+ productRefGroup = 629A63CA1D9AFE0E0089679F /* Products */;
+ projectDirPath = "";
+ projectRoot = "";
+ targets = (
+ 629A63C81D9AFE0E0089679F /* HTMLKitExample */,
+ );
+ };
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+ 629A63C51D9AFE0E0089679F /* Sources */ = {
+ isa = PBXSourcesBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ 629A63CD1D9AFE0E0089679F /* main.swift in Sources */,
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+ 629A63CE1D9AFE0E0089679F /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ CLANG_ANALYZER_NONNULL = YES;
+ CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+ CLANG_CXX_LIBRARY = "libc++";
+ CLANG_ENABLE_MODULES = YES;
+ CLANG_ENABLE_OBJC_ARC = YES;
+ CLANG_WARN_BOOL_CONVERSION = YES;
+ CLANG_WARN_CONSTANT_CONVERSION = YES;
+ CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+ CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+ CLANG_WARN_EMPTY_BODY = YES;
+ CLANG_WARN_ENUM_CONVERSION = YES;
+ CLANG_WARN_INFINITE_RECURSION = YES;
+ CLANG_WARN_INT_CONVERSION = YES;
+ CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+ CLANG_WARN_SUSPICIOUS_MOVES = YES;
+ CLANG_WARN_UNREACHABLE_CODE = YES;
+ CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+ CODE_SIGN_IDENTITY = "-";
+ COPY_PHASE_STRIP = NO;
+ DEBUG_INFORMATION_FORMAT = dwarf;
+ ENABLE_STRICT_OBJC_MSGSEND = YES;
+ ENABLE_TESTABILITY = YES;
+ GCC_C_LANGUAGE_STANDARD = gnu99;
+ GCC_DYNAMIC_NO_PIC = NO;
+ GCC_NO_COMMON_BLOCKS = YES;
+ GCC_OPTIMIZATION_LEVEL = 0;
+ GCC_PREPROCESSOR_DEFINITIONS = (
+ "DEBUG=1",
+ "$(inherited)",
+ );
+ GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+ GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+ GCC_WARN_UNDECLARED_SELECTOR = YES;
+ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+ GCC_WARN_UNUSED_FUNCTION = YES;
+ GCC_WARN_UNUSED_VARIABLE = YES;
+ MACOSX_DEPLOYMENT_TARGET = 10.11;
+ MTL_ENABLE_DEBUG_INFO = YES;
+ ONLY_ACTIVE_ARCH = YES;
+ SDKROOT = macosx;
+ SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+ };
+ name = Debug;
+ };
+ 629A63CF1D9AFE0E0089679F /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ CLANG_ANALYZER_NONNULL = YES;
+ CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+ CLANG_CXX_LIBRARY = "libc++";
+ CLANG_ENABLE_MODULES = YES;
+ CLANG_ENABLE_OBJC_ARC = YES;
+ CLANG_WARN_BOOL_CONVERSION = YES;
+ CLANG_WARN_CONSTANT_CONVERSION = YES;
+ CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+ CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+ CLANG_WARN_EMPTY_BODY = YES;
+ CLANG_WARN_ENUM_CONVERSION = YES;
+ CLANG_WARN_INFINITE_RECURSION = YES;
+ CLANG_WARN_INT_CONVERSION = YES;
+ CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+ CLANG_WARN_SUSPICIOUS_MOVES = YES;
+ CLANG_WARN_UNREACHABLE_CODE = YES;
+ CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+ CODE_SIGN_IDENTITY = "-";
+ COPY_PHASE_STRIP = NO;
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+ ENABLE_NS_ASSERTIONS = NO;
+ ENABLE_STRICT_OBJC_MSGSEND = YES;
+ GCC_C_LANGUAGE_STANDARD = gnu99;
+ GCC_NO_COMMON_BLOCKS = YES;
+ GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+ GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+ GCC_WARN_UNDECLARED_SELECTOR = YES;
+ GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+ GCC_WARN_UNUSED_FUNCTION = YES;
+ GCC_WARN_UNUSED_VARIABLE = YES;
+ MACOSX_DEPLOYMENT_TARGET = 10.11;
+ MTL_ENABLE_DEBUG_INFO = NO;
+ SDKROOT = macosx;
+ };
+ name = Release;
+ };
+ 629A63D11D9AFE0E0089679F /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ PRODUCT_NAME = "$(TARGET_NAME)";
+ SWIFT_VERSION = 3.0;
+ };
+ name = Debug;
+ };
+ 629A63D21D9AFE0E0089679F /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ PRODUCT_NAME = "$(TARGET_NAME)";
+ SWIFT_VERSION = 3.0;
+ };
+ name = Release;
+ };
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+ 629A63C41D9AFE0E0089679F /* Build configuration list for PBXProject "HTMLKitExample" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 629A63CE1D9AFE0E0089679F /* Debug */,
+ 629A63CF1D9AFE0E0089679F /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ defaultConfigurationName = Release;
+ };
+ 629A63D01D9AFE0E0089679F /* Build configuration list for PBXNativeTarget "HTMLKitExample" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 629A63D11D9AFE0E0089679F /* Debug */,
+ 629A63D21D9AFE0E0089679F /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ };
+/* End XCConfigurationList section */
+ };
+ rootObject = 629A63C11D9AFE0E0089679F /* Project object */;
+}
diff --git a/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.xcworkspace/contents.xcworkspacedata
new file mode 100644
index 0000000..8a71bc6
--- /dev/null
+++ b/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@@ -0,0 +1,7 @@
+
+
+
+
+
diff --git a/Example/HTMLKitExample/HTMLKitExample/main.swift b/Example/HTMLKitExample/HTMLKitExample/main.swift
new file mode 100644
index 0000000..2e51ce9
--- /dev/null
+++ b/Example/HTMLKitExample/HTMLKitExample/main.swift
@@ -0,0 +1,167 @@
+//
+// main.swift
+// HTMLKitExample
+//
+// Created by Iska on 27/09/16.
+// Copyright © 2016 iabudiab. All rights reserved.
+//
+
+import HTMLKit
+
+// Simple scraper that is able to load a page, query it via CSS selectors, and follow links
+class Scraper {
+
+ enum ScrapingError: Error {
+ case DocumentNotLoaded
+ case ElementNotFound(String)
+ case InvalidAnchorUrl(String)
+ case CouldNotLoadPage(URL)
+ }
+
+ private var url: URL
+ private(set) var document: HTMLDocument?
+
+ init(url: URL) {
+ self.url = url
+ self.document = nil
+ }
+
+ func load() throws {
+ try loadDocument(at: url)
+ }
+
+ func listElements(matching selector: CSSSelector) throws -> [HTMLElement] {
+ guard let document = document else {
+ throw ScrapingError.DocumentNotLoaded
+ }
+
+ return document.elements(matching: selector)
+ }
+
+ func followLink(matchingSelector selector: CSSSelector) throws {
+ guard let document = document else {
+ throw ScrapingError.DocumentNotLoaded
+ }
+
+ guard let link = document.firstElement(matching: selector) else {
+ throw ScrapingError.ElementNotFound(selector.debugDescription)
+ }
+
+ guard let targetUrl = URL(string: link["href"], relativeTo: url) else {
+ throw ScrapingError.InvalidAnchorUrl(link["href"])
+ }
+
+ try loadDocument(at: targetUrl)
+ }
+
+ private func loadDocument(at url: URL) throws {
+ guard let content = try? String(contentsOf: url) else {
+ throw ScrapingError.CouldNotLoadPage(url)
+ }
+ document = HTMLDocument(string: content)
+ }
+}
+
+// A custom block-based selector, that matches only elements having the given text content:
+// i.e. textContentSelector("Hello") will match <p>Hello</p> and <div>Hello</div>
+// but won't match <p>World</p> or <p>Hello there</p>
+func textContentSelector(text: String) -> CSSSelector {
+ return namedBlockSelector("[@textContent='\(text)']") { (element) -> Bool in
+ return element.textContent == text
+ }
+}
+
+// Helper function to create a typed-selector matching an anchor element that has the given
+// text content.
+func anchorElement(havingContent: String) -> CSSSelector {
+ return allOf(
+ [
+ typeSelector("a"),
+ textContentSelector(text: havingContent)
+ ]
+ )
+}
+
+// Helper function to print the trimmed text of the "content" cells in a GitHub repository file row
+func printRepositoryFile(element: HTMLElement) {
+
+ // A node iterator filter that accepts only <td> elements of class "content", i.e. <td class="content">
+ let contentIterator = element.nodeIterator(showOptions: .element) { (node) -> HTMLNodeFilterValue in
+ guard let element = node as? HTMLElement else { return .reject }
+
+ if element.tagName == "td" && element["class"] == "content" {
+ return .accept
+ }
+
+ return .reject
+ }
+
+ for td in contentIterator {
+ // The cast is necessary because Swift 3 won't import the generics info of the NSEnumerator class
+ // i.e. the nextObject() function always has the following signature 'func nextObject() -> Any?'
+ let title = (td as AnyObject).textContent.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
+ print("- \(title)")
+ }
+}
+
+let htmlKitUrl = URL(string: "https://github.com/iabudiab/HTMLKit")!
+let scraper = Scraper(url: htmlKitUrl)
+
+do {
+ // Load the page
+ try scraper.load()
+
+ // Parse the selector
+ let repositoryContent = try CSSSelectorParser.parseSelector("[role='main'] .repository-content > .file-wrap > .files tr.js-navigation-item")
+
+ // Query matching elements
+ let files = try scraper.listElements(matching: repositoryContent)
+
+ print("HTMLKit repositroy root:")
+ files.forEach(printRepositoryFile)
+} catch let error {
+ print(error)
+}
+
+do {
+ // Follow some links
+ try scraper.followLink(matchingSelector: anchorElement(havingContent: "Sources"))
+ try scraper.followLink(matchingSelector: anchorElement(havingContent: "HTMLEOFToken.m"))
+
+ // The following selector: "[role='main'] div.file table.js-file-line-container td:nth-child(2)"
+ // can be defined in a type-safe manner:
+ let selector = allOf([
+ descendantOfElementSelector(
+ attributeSelector(.exactMatch, "role", "main")
+ ),
+ descendantOfElementSelector(
+ allOf([
+ typeSelector("div"),
+ classSelector("file")
+ ])
+ ),
+ descendantOfElementSelector(
+ allOf([
+ typeSelector("table"),
+ classSelector("js-file-line-container")
+ ])
+ ),
+ typeSelector("td"),
+ nthChildSelector(
+ CSSNthExpressionMake(0, 2)
+ )
+ ])
+
+ // Query matching elements
+ let elements = try scraper.listElements(matching: selector)
+
+ // This will print the source code for the "HTMLEOFToken.m" file under this url:
+ // https://github.com/iabudiab/HTMLKit/blob/master/Sources/HTMLEOFToken.m
+
+ print("\nHTMLEOFToken:")
+ elements.forEach {
+ print($0.textContent)
+ }
+} catch let error {
+ print(error)
+}
diff --git a/HTMLKit.xcworkspace/contents.xcworkspacedata b/HTMLKit.xcworkspace/contents.xcworkspacedata
index 878a400..da4baa1 100644
--- a/HTMLKit.xcworkspace/contents.xcworkspacedata
+++ b/HTMLKit.xcworkspace/contents.xcworkspacedata
@@ -7,4 +7,7 @@
+
+
|