feat(Domain Extractor): Add Domain Extractor

hueristiq · Oct 20, 2024 · a9e1f60 · a9e1f60
1 parent 896b7af
commit a9e1f60
Show file tree

Hide file tree

Showing 5 changed files with 477 additions and 54 deletions.
diff --git a/README.md b/README.md
@@ -8,18 +8,23 @@
 
 * [Features](#features)
 * [Usage](#usage)
-    * [URL Extraction](#url-extraction)
-        * [Customizing URL Extraction](#customizing-url-extraction)
-    * [Domain Parsing](#domain-parsingn)
-    * [URL Parsing](#url-parsing)
+	* [Extraction](#extraction)
+		* [Domains](#domains)
+			* [Customizing Domain Extractor](#customizing-domain-extractor)
+		* [URLs](#urls)
+			* [Customizing URL Extractor](#customizing-url-extractor)
+	* [Parsing](#parsing)
+		* [Domains](#domains-1)
+		* [URLs](#urls-1)
 * [Contributing](#contributing)
 * [Licensing](#licensing)
 * [Credits](#credits)
-    * [Contributors](#contributors)
-    * [Similar Projects](#similar-projects)
+	* [Contributors](#contributors)
+	* [Similar Projects](#similar-projects)
 
 ## Features
 
+* **Flexible Domain Extraction:** Extract domains from text using regular expressions.
 * **Flexible URL Extraction:** Extract URLs from text using regular expressions.
 * **Domain Parsing:** Parse domains into subdomains, root domains, and top-level domains (TLDs).
 * **Extended URL Parsing:** Extend the standard `net/url` package in Go with additional fields and capabilities.
@@ -38,106 +43,158 @@ This command will download and install the `hq-go-url` package into your Go work
 
 Below are examples demonstrating how to use the different features of the `hq-go-url` package.
 
-### URL Extraction
+### Extraction
+
+> [!NOTE]
+> Since the Extraction API is centered around [regexp.Regexp](https://golang.org/pkg/regexp/#Regexp), many other methods (e.g., `FindAllStringIndex`, `ReplaceAllString`) are also available on the compiled regex.
+
+#### Domains
+
+You can extract domains from a given text string using the `DomainExtractor`. Here's a simple example:
+
+```go
+package main
+
+import (
+	"fmt"
+	hqgourl "github.com/hueristiq/hq-go-url"
+	"regexp"
+)
+
+func main() {
+	extractor := hqgourl.NewDomainExtractor()
+	text := "Check out this website: https://example.com and send an email to info@example.com."
+
+	regex := extractor.CompileRegex()
+	matches := regex.FindAllString(text, -1)
+
+	fmt.Println("Found Domain:", matches)
+}
+```
+
+##### Customizing Domain Extractor
+
+You can customize how domains are extracted by providing custom regular expression patterns for the root domain or the top-level domain (TLD).
+
+* Extract domains with TLD Pattern:
+
+	```go
+	extractor := hqgourl.NewDomainExtractor(
+		hqgourl.DomainExtractorWithTLDPattern(`(?:com|net|org)`),
+	)
+	```
+
+	This configuration will extract only domains with `com`, `net`, or `org` TLDs.
+
+* Extract domains with Root Domain Pattern:
+
+	```go
+	extractor := hqgourl.NewDomainExtractor(
+		hqgourl.DomainExtractorWithRootDomainPattern(`(?:example|rootdomain)`), // Custom root domain pattern
+	)
+	```
+
+	This configuration will extract domains that have `example` or `rootdomain` as their root domain.
+
+#### URLs
 
 You can extract URLs from a given text string using the Extractor. Here's a simple example:
 
 ```go
 package main
 
 import (
-    "fmt"
-    hqgourl "github.com/hueristiq/hq-go-url"
-    "regexp"
+	"fmt"
+	hqgourl "github.com/hueristiq/hq-go-url"
+	"regexp"
 )
 
 func main() {
-    extr := hqgourl.NewExtractor()
-    text := "Check out this website: https://example.com and send an email to [email protected]."
-    
-    regex := extr.CompileRegex()
-    matches := regex.FindAllString(text, -1)
-    
-    fmt.Println("Found URLs:", matches)
+	extractor := hqgourl.NewExtractor()
+	text := "Check out this website: https://example.com and send an email to info@example.com."
+
+	regex := extractor.CompileRegex()
+	matches := regex.FindAllString(text, -1)
+
+	fmt.Println("Found URLs:", matches)
 }
 ```
 
-#### Customizing URL Extraction
+##### Customizing URL Extractor
 
 You can customize how URLs are extracted by specifying URL schemes, hosts, or providing custom regular expression patterns.
 
-* Extract URLs with Specific Schemes (e.g., HTTP, HTTPS, FTP):
+* Extract URLs with Schemes Pattern:
 
-    ```go
-    extr := hqgourl.NewExtractor(
-        hqgourl.ExtractorWithSchemePattern(`(?:https?|ftp)://`),
-    )
-    ```
+	```go
+	extractor := hqgourl.NewExtractor(
+		hqgourl.ExtractorWithSchemePattern(`(?:https?|ftp)://`),
+	)
+	```
 
-    This configuration will extract only URLs starting with http, https, or ftp schemes.
+	This configuration will extract URLs with `http`, `https`, or `ftp` schemes.
 
-* Extract URLs with Custom Host Patterns (e.g., example.com):
+* Extract URLs with Host Pattern:
 
-    ```go
-    extr := hqgourl.NewExtractor(
-        hqgourl.ExtractorWithHostPattern(`(?:www\.)?example\.com`),
-    )
+	```go
+	extractor := hqgourl.NewExtractor(
+		hqgourl.ExtractorWithHostPattern(`(?:www\.)?example\.com`),
+	)
 
-    ```
+	```
 
-    This setup will extract URLs that have hosts matching www.example.com or example.com.
+	This configuration will extract URLs that have hosts matching `www.example.com` or `example.com`.
 
-> [!NOTE]
-> Since API is centered around [regexp.Regexp](https://golang.org/pkg/regexp/#Regexp), many other methods are available
+### Parsing
 
-### Domain Parsing
+#### Domains
 
 The `DomainParser` can parse domains into their components, such as subdomains, root domains, and TLDs:
 
 ```go
 package main
 
 import (
-    "fmt"
-    hqgourl "github.com/hueristiq/hq-go-url"
+	"fmt"
+	hqgourl "github.com/hueristiq/hq-go-url"
 )
 
 func main() {
-    dp := hqgourl.NewDomainParser()
+	dp := hqgourl.NewDomainParser()
 
-    parsedDomain := dp.Parse("subdomain.example.com")
+	parsedDomain := dp.Parse("subdomain.example.com")
 
-    fmt.Printf("Subdomain: %s, Root Domain: %s, TLD: %s\n", parsedDomain.Sub, parsedDomain.Root, parsedDomain.TopLevel)
+	fmt.Printf("Subdomain: %s, Root Domain: %s, TLD: %s\n", parsedDomain.Sub, parsedDomain.Root, parsedDomain.TopLevel)
 }
 ```
 
-### URL Parsing
+#### URLs
 
 The `Parser` provides an extended way to parse URLs, including additional fields like port and file extension:
 
 ```go
 package main
 
 import (
-    "fmt"
-    hqgourl "github.com/hueristiq/hq-go-url"
+	"fmt"
+	hqgourl "github.com/hueristiq/hq-go-url"
 )
 
 func main() {
-    up := hqgourl.NewParser()
+	up := hqgourl.NewParser()
 
-    parsedURL, err := up.Parse("https://subdomain.example.com:8080/path/file.txt")
-    if err != nil {
-        fmt.Println("Error parsing URL:", err)
+	parsedURL, err := up.Parse("https://subdomain.example.com:8080/path/file.txt")
+	if err != nil {
+		fmt.Println("Error parsing URL:", err)
 
-        return
-    }
+		return
+	}
 
-    fmt.Printf("Subdomain: %s\n", parsedURL.Domain.Sub)
-    fmt.Printf("Root Domain: %s\n", parsedURL.Domain.Root)
-    fmt.Printf("TLD: %s\n", parsedURL.Domain.TopLevel)
-    fmt.Printf("Port: %d\n", parsedURL.Port)
-    fmt.Printf("File Extension: %s\n", parsedURL.Extension)
+	fmt.Printf("Subdomain: %s\n", parsedURL.Domain.Sub)
+	fmt.Printf("Root Domain: %s\n", parsedURL.Domain.Root)
+	fmt.Printf("TLD: %s\n", parsedURL.Domain.TopLevel)
+	fmt.Printf("Port: %d\n", parsedURL.Port)
+	fmt.Printf("File Extension: %s\n", parsedURL.Extension)
 }
 ```
 

diff --git a/domain_extractor.go b/domain_extractor.go
@@ -0,0 +1,125 @@
+package url
+
+import (
+	"regexp"
+	"unicode/utf8"
+
+	"github.com/hueristiq/hq-go-url/tlds"
+)
+
+// DomainExtractor holds the configuration used to build a domain-matching regular
+// expression. Both fields are optional regex fragments; when a field is left empty,
+// CompileRegex falls back to the package's built-in pattern for that part of the domain.
+type DomainExtractor struct {
+	RootDomainPattern     string // Optional regex fragment for the root domain label (e.g., "example"); empty selects the default.
+	TopLevelDomainPattern string // Optional regex fragment for the TLD (e.g., "com"); empty selects the default TLD pattern.
+}
+
+// CompileRegex builds and compiles a regular expression that matches domain names
+// according to the DomainExtractor's configuration.
+//
+// The expression is the root-domain part followed by the TLD part:
+//   - Root part: the package's _subdomainPattern by default; when RootDomainPattern is set,
+//     any number of `\w+.` labels followed by the custom pattern and a literal dot.
+//   - TLD part: by default punycode (`xn--...`), the official ASCII TLDs plus the pseudo TLDs
+//     (closed with `\b`), and the official Unicode TLDs, matched case-insensitively; when
+//     TopLevelDomainPattern is set, the custom pattern is used instead.
+//   - When neither pattern is customized, "localhost" is accepted as an alternative.
+//
+// Custom patterns are wrapped in non-capturing groups so that a top-level alternation such
+// as `com|net` stays confined to its own part of the expression. As a consequence, inline
+// flags inside a custom pattern apply to that pattern only.
+//
+// CompileRegex panics if a custom pattern makes the combined expression invalid, because
+// it compiles with regexp.MustCompile.
+//
+// Returns:
+//   - regex: The compiled regular expression, switched to leftmost-longest matching.
+func (e *DomainExtractor) CompileRegex() (regex *regexp.Regexp) {
+	// Root part: default sub-domain pattern, or optional labels plus the custom root pattern.
+	rootPattern := _subdomainPattern
+
+	if e.RootDomainPattern != "" {
+		rootPattern = `(?:\w+[.])*(?:` + e.RootDomainPattern + `)\.`
+	}
+
+	// TLD part: the custom pattern, or the default built from the known TLD lists.
+	var tldPattern string
+
+	if e.TopLevelDomainPattern != "" {
+		tldPattern = `(?:` + e.TopLevelDomainPattern + `)`
+	} else {
+		// Split the official list at the first entry that starts with a non-ASCII byte.
+		// NOTE(review): this assumes tlds.Official lists every ASCII TLD before the Unicode ones — confirm.
+		// If no Unicode entry exists, the whole list is treated as ASCII. The capacity is capped
+		// so the append below copies instead of writing into tlds.Official's backing array.
+		var asciiTLDs, unicodeTLDs []string
+
+		asciiTLDs = tlds.Official[:len(tlds.Official):len(tlds.Official)]
+
+		for i, tld := range tlds.Official {
+			// The emptiness check keeps tld[0] from panicking on a blank entry.
+			if tld != "" && tld[0] >= utf8.RuneSelf {
+				asciiTLDs = tlds.Official[:i:i]
+				unicodeTLDs = tlds.Official[i:]
+
+				break
+			}
+		}
+
+		punycode := `xn--[a-z0-9-]+`
+		alternatives := `(?i)` + punycode + `|` + anyOf(append(asciiTLDs, tlds.Pseudo...)...) + `\b`
+
+		// Only add the Unicode alternative when there is something to match.
+		if len(unicodeTLDs) > 0 {
+			alternatives += `|` + anyOf(unicodeTLDs...)
+		}
+
+		tldPattern = `(?:` + alternatives + `)`
+	}
+
+	pattern := rootPattern + tldPattern
+
+	// The bare host "localhost" is accepted only when both patterns are left at their defaults.
+	if e.RootDomainPattern == "" && e.TopLevelDomainPattern == "" {
+		pattern += `|localhost`
+	}
+
+	regex = regexp.MustCompile(`(?:` + pattern + `)`)
+
+	// Prefer the leftmost-longest match so the longest domain candidate wins.
+	regex.Longest()
+
+	return regex
+}
+
+// DomainExtractorOptionsFunc is a functional option: a function that mutates a
+// DomainExtractor in place. NewDomainExtractor applies each option it receives in order.
+type DomainExtractorOptionsFunc func(*DomainExtractor)
+
+// DomainExtractorInterface is the behavior exposed by a domain extractor:
+// producing a compiled regular expression that matches domain names.
+type DomainExtractorInterface interface {
+	CompileRegex() (regex *regexp.Regexp)
+}
+
+// Compile-time check that *DomainExtractor satisfies DomainExtractorInterface.
+var _ DomainExtractorInterface = (*DomainExtractor)(nil)
+
+// NewDomainExtractor builds a DomainExtractor and applies the supplied options to it.
+// With no options, both pattern fields stay empty.
+//
+// Parameters:
+//   - opts: Zero or more option functions that customize the extractor.
+//
+// Returns:
+//   - extractor: A pointer to the configured DomainExtractor.
+func NewDomainExtractor(opts ...DomainExtractorOptionsFunc) (extractor *DomainExtractor) {
+	extractor = new(DomainExtractor)
+
+	// Options run in argument order, each one mutating the same extractor.
+	for i := range opts {
+		opts[i](extractor)
+	}
+
+	return extractor
+}
+
+// DomainExtractorWithRootDomainPattern produces an option that stores a custom regex
+// pattern for the root domain (e.g., "example" in "example.com") on a DomainExtractor.
+//
+// Parameters:
+//   - pattern: The regex pattern to use for the root domain.
+//
+// Returns:
+//   - An option function that assigns pattern to the extractor's RootDomainPattern field.
+func DomainExtractorWithRootDomainPattern(pattern string) DomainExtractorOptionsFunc {
+	setRootDomainPattern := func(extractor *DomainExtractor) {
+		extractor.RootDomainPattern = pattern
+	}
+
+	return setRootDomainPattern
+}
+
+// DomainExtractorWithTLDPattern produces an option that stores a custom regex pattern
+// for the top-level domain (e.g., "com" in "example.com") on a DomainExtractor.
+//
+// Parameters:
+//   - pattern: The regex pattern to use for the TLD.
+//
+// Returns:
+//   - An option function that assigns pattern to the extractor's TopLevelDomainPattern field.
+func DomainExtractorWithTLDPattern(pattern string) DomainExtractorOptionsFunc {
+	setTopLevelDomainPattern := func(extractor *DomainExtractor) {
+		extractor.TopLevelDomainPattern = pattern
+	}
+
+	return setTopLevelDomainPattern
+}