From a751cec2159e38d5ad725c8cecf50aad3e35dbfd Mon Sep 17 00:00:00 2001 From: iska Date: Tue, 27 Sep 2016 22:30:45 +0200 Subject: [PATCH] Add example project --- .../HTMLKitExample.xcodeproj/project.pbxproj | 257 ++++++++++++++++++ .../contents.xcworkspacedata | 7 + .../HTMLKitExample/HTMLKitExample/main.swift | 167 ++++++++++++ HTMLKit.xcworkspace/contents.xcworkspacedata | 3 + 4 files changed, 434 insertions(+) create mode 100644 Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.pbxproj create mode 100644 Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.xcworkspace/contents.xcworkspacedata create mode 100644 Example/HTMLKitExample/HTMLKitExample/main.swift diff --git a/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.pbxproj b/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.pbxproj new file mode 100644 index 0000000..e6f0864 --- /dev/null +++ b/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.pbxproj @@ -0,0 +1,257 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 46; + objects = { + +/* Begin PBXBuildFile section */ + 629A63CD1D9AFE0E0089679F /* main.swift in Sources */ = {isa = PBXBuildFile; fileRef = 629A63CC1D9AFE0E0089679F /* main.swift */; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 629A63C71D9AFE0E0089679F /* CopyFiles */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = /usr/share/man/man1/; + dstSubfolderSpec = 0; + files = ( + ); + runOnlyForDeploymentPostprocessing = 1; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ + 629A63C91D9AFE0E0089679F /* HTMLKitExample */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = HTMLKitExample; sourceTree = BUILT_PRODUCTS_DIR; }; + 629A63CC1D9AFE0E0089679F /* main.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = main.swift; sourceTree = ""; }; +/* End 
PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 629A63C61D9AFE0E0089679F /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 629A63C01D9AFE0E0089679F = { + isa = PBXGroup; + children = ( + 629A63CB1D9AFE0E0089679F /* HTMLKitExample */, + 629A63CA1D9AFE0E0089679F /* Products */, + ); + sourceTree = ""; + }; + 629A63CA1D9AFE0E0089679F /* Products */ = { + isa = PBXGroup; + children = ( + 629A63C91D9AFE0E0089679F /* HTMLKitExample */, + ); + name = Products; + sourceTree = ""; + }; + 629A63CB1D9AFE0E0089679F /* HTMLKitExample */ = { + isa = PBXGroup; + children = ( + 629A63CC1D9AFE0E0089679F /* main.swift */, + ); + path = HTMLKitExample; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 629A63C81D9AFE0E0089679F /* HTMLKitExample */ = { + isa = PBXNativeTarget; + buildConfigurationList = 629A63D01D9AFE0E0089679F /* Build configuration list for PBXNativeTarget "HTMLKitExample" */; + buildPhases = ( + 629A63C51D9AFE0E0089679F /* Sources */, + 629A63C61D9AFE0E0089679F /* Frameworks */, + 629A63C71D9AFE0E0089679F /* CopyFiles */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = HTMLKitExample; + productName = HTMLKitExample; + productReference = 629A63C91D9AFE0E0089679F /* HTMLKitExample */; + productType = "com.apple.product-type.tool"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 629A63C11D9AFE0E0089679F /* Project object */ = { + isa = PBXProject; + attributes = { + LastSwiftUpdateCheck = 0800; + LastUpgradeCheck = 0800; + ORGANIZATIONNAME = iabudiab; + TargetAttributes = { + 629A63C81D9AFE0E0089679F = { + CreatedOnToolsVersion = 8.0; + ProvisioningStyle = Automatic; + }; + }; + }; + buildConfigurationList = 629A63C41D9AFE0E0089679F /* Build configuration list for PBXProject 
"HTMLKitExample" */; + compatibilityVersion = "Xcode 3.2"; + developmentRegion = English; + hasScannedForEncodings = 0; + knownRegions = ( + en, + ); + mainGroup = 629A63C01D9AFE0E0089679F; + productRefGroup = 629A63CA1D9AFE0E0089679F /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 629A63C81D9AFE0E0089679F /* HTMLKitExample */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXSourcesBuildPhase section */ + 629A63C51D9AFE0E0089679F /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 629A63CD1D9AFE0E0089679F /* main.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 629A63CE1D9AFE0E0089679F /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_SUSPICIOUS_MOVES = YES; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "-"; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + 
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + MACOSX_DEPLOYMENT_TARGET = 10.11; + MTL_ENABLE_DEBUG_INFO = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = macosx; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + 629A63CF1D9AFE0E0089679F /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_SUSPICIOUS_MOVES = YES; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "-"; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + MACOSX_DEPLOYMENT_TARGET = 10.11; + MTL_ENABLE_DEBUG_INFO = NO; + SDKROOT = macosx; + }; + name = Release; + }; + 629A63D11D9AFE0E0089679F /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_VERSION = 3.0; + }; + name = Debug; + }; + 629A63D21D9AFE0E0089679F /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_VERSION = 3.0; + }; + name = Release; + }; 
+/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 629A63C41D9AFE0E0089679F /* Build configuration list for PBXProject "HTMLKitExample" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 629A63CE1D9AFE0E0089679F /* Debug */, + 629A63CF1D9AFE0E0089679F /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 629A63D01D9AFE0E0089679F /* Build configuration list for PBXNativeTarget "HTMLKitExample" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 629A63D11D9AFE0E0089679F /* Debug */, + 629A63D21D9AFE0E0089679F /* Release */, + ); + defaultConfigurationIsVisible = 0; + }; +/* End XCConfigurationList section */ + }; + rootObject = 629A63C11D9AFE0E0089679F /* Project object */; +} diff --git a/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 0000000..8a71bc6 --- /dev/null +++ b/Example/HTMLKitExample/HTMLKitExample.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@ + + + + + diff --git a/Example/HTMLKitExample/HTMLKitExample/main.swift b/Example/HTMLKitExample/HTMLKitExample/main.swift new file mode 100644 index 0000000..2e51ce9 --- /dev/null +++ b/Example/HTMLKitExample/HTMLKitExample/main.swift @@ -0,0 +1,167 @@ +// +// main.swift +// HTMLKitExample +// +// Created by Iska on 27/09/16. +// Copyright © 2016 iabudiab. All rights reserved. +// + +import HTMLKit + +// Simple scraper that is able to load a page, query via CSS Selectors, and following links +class Scraper { + + enum ScrapingError: Error { + case DocumentNotLoaded + case ElementNotFound(String) + case InvalidAnchorUrl(String) + case CouldNotLoadPage(URL) + } + + private var url: URL + private(set) var document: HTMLDocument? 
+
+    /// URL of the page currently held in `document`; `nil` until a page has been loaded.
+    /// Tracked separately from `url` so that `load()` keeps fetching the initial page.
+    private var documentUrl: URL?
+
+    /// Creates a scraper for the given start page. Nothing is fetched until `load()` is called.
+    ///
+    /// - Parameter url: The URL of the page that `load()` fetches.
+    init(url: URL) {
+        self.url = url
+        self.document = nil
+    }
+
+    /// Loads and parses the page at the URL this scraper was created with.
+    ///
+    /// - Throws: `ScrapingError.CouldNotLoadPage` if the page content cannot be read.
+    func load() throws {
+        try loadDocument(at: url)
+    }
+
+    /// Returns the elements of the loaded document that match `selector`.
+    ///
+    /// - Parameter selector: The CSS selector to evaluate against the loaded document.
+    /// - Returns: The result of `HTMLDocument.elements(matching:)` for `selector`.
+    /// - Throws: `ScrapingError.DocumentNotLoaded` if no page has been loaded yet.
+    func listElements(matching selector: CSSSelector) throws -> [HTMLElement] {
+        guard let document = document else {
+            throw ScrapingError.DocumentNotLoaded
+        }
+
+        return document.elements(matching: selector)
+    }
+
+    /// Finds the first element matching `selector`, reads its `href` attribute and loads the
+    /// page it points to, replacing the current document.
+    ///
+    /// - Parameter selector: The CSS selector identifying the link element to follow.
+    /// - Throws: `ScrapingError.DocumentNotLoaded` if no page has been loaded yet,
+    ///   `ScrapingError.ElementNotFound` if nothing matches `selector`,
+    ///   `ScrapingError.InvalidAnchorUrl` if the `href` value cannot be turned into a URL,
+    ///   `ScrapingError.CouldNotLoadPage` if the target page cannot be read.
+    func followLink(matchingSelector selector: CSSSelector) throws {
+        guard let document = document else {
+            throw ScrapingError.DocumentNotLoaded
+        }
+
+        guard let link = document.firstElement(matching: selector) else {
+            throw ScrapingError.ElementNotFound(selector.debugDescription)
+        }
+
+        // Resolve the href against the page that is currently loaded, not against the initial
+        // URL: after a previous followLink the two differ, and relative links would otherwise
+        // be resolved against the wrong base.
+        let baseUrl = documentUrl ?? url
+        guard let targetUrl = URL(string: link["href"], relativeTo: baseUrl) else {
+            throw ScrapingError.InvalidAnchorUrl(link["href"])
+        }
+
+        try loadDocument(at: targetUrl)
+    }
+
+    /// Reads the content at `url`, parses it and makes the result the current document.
+    ///
+    /// - Parameter url: The location of the page to load.
+    /// - Throws: `ScrapingError.CouldNotLoadPage` if the content cannot be read as a string.
+    private func loadDocument(at url: URL) throws {
+        // `try?` discards the underlying error; callers only learn which URL failed.
+        guard let content = try? String(contentsOf: url) else {
+            throw ScrapingError.CouldNotLoadPage(url)
+        }
+        document = HTMLDocument(string: content)
+        // Remember where the document came from so followLink can resolve relative links.
+        documentUrl = url
+    }
+}
+
+// A custom block-based selector, that matches only elements having the given text content:
+// i.e. textContentSelector("Hello") will match

+// <div><p>Hello</p></div> and <a>Hello</a>
+// but won't match <div>World</div> or <div><p>Hello there</p></div>
+/// Creates a block-based selector whose block accepts an element when its `textContent`
+/// is exactly equal to `text`.
+///
+/// - Parameter text: The text content an element must have to be accepted.
+/// - Returns: The selector created by `namedBlockSelector`, named `[@textContent='<text>']`.
+func textContentSelector(text: String) -> CSSSelector {
+    return namedBlockSelector("[@textContent='\(text)']") { (element) -> Bool in
+        return element.textContent == text
+    }
+}
+
+// Helper function to create a typed-selector matching an anchor element that has the given
+// text content.
+/// - Parameter havingContent: The exact text content the anchor element must have.
+/// - Returns: The combination (`allOf`) of the type selector for `a` and
+///   `textContentSelector(text:)`.
+func anchorElement(havingContent: String) -> CSSSelector {
+    return allOf(
+        [
+            typeSelector("a"),
+            textContentSelector(text: havingContent)
+        ]
+    )
+}
+
+// Helper function to print the content of a github repository file content
+/// Prints one line (`- <text>`) for every `td` element with class `content` found by
+/// iterating `element`, using the whitespace-trimmed text content of that cell.
+///
+/// - Parameter element: The element whose node iterator is searched.
+func printRepositoryFile(element: HTMLElement) {
+
+    // A node iterator filter that iterates only elements of class "content" i.e. <td class="content">
+    let contentIterator = element.nodeIterator(showOptions: .element) { (node) -> HTMLNodeFilterValue in
+        guard let element = node as? HTMLElement else { return .reject }
+
+        if element.tagName == "td" && element["class"] == "content" {
+            return .accept
+        }
+
+        return .reject
+    }
+
+    for node in contentIterator {
+        // The cast is necessary because Swift 3 won't import the generics info of the NSEnumerator
+        // class, i.e. the nextObject() function always has the signature 'func nextObject() -> Any?'.
+        // A checked downcast is used instead of untyped AnyObject dispatch so that the call to
+        // `textContent` is verified by the compiler.
+        guard let td = node as? HTMLElement else { continue }
+
+        let title = td.textContent.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines)
+        print("- \(title)")
+    }
+}
+
+let htmlKitUrl = URL(string: "https://github.com/iabudiab/HTMLKit")!
+let scraper = Scraper(url: htmlKitUrl)
+
+// Part 1: list the entries shown on the repository landing page.
+do {
+    // Fetch and parse the start page
+    try scraper.load()
+
+    // Build the selector from its CSS string form
+    let repositoryContent = try CSSSelectorParser.parseSelector("[role='main'] .repository-content > .file-wrap > .files tr.js-navigation-item")
+
+    // Collect the matching rows and print each of them
+    let files = try scraper.listElements(matching: repositoryContent)
+
+    print("HTMLKit repositroy root:")
+    for file in files {
+        printRepositoryFile(element: file)
+    }
+} catch {
+    print(error)
+}
+
+// Part 2: navigate to a source file and print its lines.
+do {
+    // Follow the "Sources" link, then the link to the file itself
+    try scraper.followLink(matchingSelector: anchorElement(havingContent: "Sources"))
+    try scraper.followLink(matchingSelector: anchorElement(havingContent: "HTMLEOFToken.m"))
+
+    // The following selector: "[role='main'] div.file table.js-file-line-container td:nth-child(2)"
+    // can be defined in type-safe manner, here assembled from named parts:
+    let insideMain = descendantOfElementSelector(
+        attributeSelector(.exactMatch, "role", "main")
+    )
+    let insideFileDiv = descendantOfElementSelector(
+        allOf([typeSelector("div"), classSelector("file")])
+    )
+    let insideLineTable = descendantOfElementSelector(
+        allOf([typeSelector("table"), classSelector("js-file-line-container")])
+    )
+    let tableCell = typeSelector("td")
+    let secondChild = nthChildSelector(CSSNthExpressionMake(0, 2))
+
+    let selector = allOf([insideMain, insideFileDiv, insideLineTable, tableCell, secondChild])
+
+    // Query matching elements
+    let elements = try scraper.listElements(matching: selector)
+
+    // This will print the source code for the "HTMLEOFToken.m" file under this url:
+    // https://github.com/iabudiab/HTMLKit/blob/master/Sources/HTMLEOFToken.m
+
+    print("\nHTMLEOFToken:")
+    for element in elements {
+        print(element.textContent)
+    }
+} catch {
+    print(error)
+}
diff --git a/HTMLKit.xcworkspace/contents.xcworkspacedata b/HTMLKit.xcworkspace/contents.xcworkspacedata
index 878a400..da4baa1 100644
--- a/HTMLKit.xcworkspace/contents.xcworkspacedata
+++ b/HTMLKit.xcworkspace/contents.xcworkspacedata
@@ -7,4 +7,7 @@
+
+