diff --git a/README.md b/README.md index 78fc345..0bcd152 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,23 @@ * [Features](#features) * [Usage](#usage) - * [URL Extraction](#url-extraction) - * [Customizing URL Extraction](#customizing-url-extraction) - * [Domain Parsing](#domain-parsingn) - * [URL Parsing](#url-parsing) + * [Extraction](#extraction) + * [Domains](#domains) + * [Customizing Domain Extractor](#customizing-domain-extractor) + * [URLs](#urls) + * [Customizing URL Extractor](#customizing-url-extractor) + * [Parsing](#parsing) + * [Domains](#domains-1) + * [URLs](#urls-1) * [Contributing](#contributing) * [Licensing](#licensing) * [Credits](#credits) - * [Contributors](#contributors) - * [Similar Projects](#similar-projects) + * [Contributors](#contributors) + * [Similar Projects](#similar-projects) ## Features +* **Flexible Domain Extraction:** Extract domains from text using regular expressions. * **Flexible URL Extraction:** Extract URLs from text using regular expressions. * **Domain Parsing:** Parse domains into subdomains, root domains, and top-level domains (TLDs). * **Extended URL Parsing:** Extend the standard `net/url` package in Go with additional fields and capabilities. @@ -38,7 +43,60 @@ This command will download and install the `hq-go-url` package into your Go work Below are examples demonstrating how to use the different features of the `hq-go-url` package. -### URL Extraction +### Extraction + +> [!NOTE] +> Since the Extraction API is centered around [regexp.Regexp](https://golang.org/pkg/regexp/#Regexp), the other `regexp.Regexp` methods (not just `FindAllString`) are available on the compiled expression. + +#### Domains + +You can extract domains from a given text string using the `DomainExtractor`. Here's a simple example: + +```go +package main + +import ( + "fmt" + hqgourl "github.com/hueristiq/hq-go-url" + "regexp" +) + +func main() { + extractor := hqgourl.NewDomainExtractor() + text := "Check out this website: https://example.com and send an email to info@example.com." 
+ + regex := extractor.CompileRegex() + matches := regex.FindAllString(text, -1) + + fmt.Println("Found Domain:", matches) +} +``` + +##### Customizing Domain Extractor + +You can customize how domains are extracted by providing custom regular expression patterns for the root domain, the top-level domain (TLD), or both. + +* Extract domains with TLD Pattern: + + ```go + extractor := hqgourl.NewDomainExtractor( + hqgourl.DomainExtractorWithTLDPattern(`(?:com|net|org)`), + ) + ``` + + This configuration will extract only domains with `com`, `net`, or `org` TLDs. + +* Extract domains with Root Domain Pattern: + + ```go + extractor := hqgourl.NewDomainExtractor( + hqgourl.DomainExtractorWithRootDomainPattern(`(?:example|rootdomain)`), // Custom root domain pattern + ) + ``` + + This configuration will extract only domains whose root domain is `example` or `rootdomain`. + +#### URLs You can extract URLs from a given text string using the Extractor. Here's a simple example: @@ -46,51 +104,50 @@ You can extract URLs from a given text string using the Extractor. Here's a simp package main import ( - "fmt" - hqgourl "github.com/hueristiq/hq-go-url" - "regexp" + "fmt" + hqgourl "github.com/hueristiq/hq-go-url" + "regexp" ) func main() { - extr := hqgourl.NewExtractor() - text := "Check out this website: https://example.com and send an email to info@example.com." - - regex := extr.CompileRegex() - matches := regex.FindAllString(text, -1) - - fmt.Println("Found URLs:", matches) + extractor := hqgourl.NewExtractor() + text := "Check out this website: https://example.com and send an email to info@example.com." + + regex := extractor.CompileRegex() + matches := regex.FindAllString(text, -1) + + fmt.Println("Found URLs:", matches) } ``` -#### Customizing URL Extraction +##### Customizing URL Extractor You can customize how URLs are extracted by specifying URL schemes, hosts, or providing custom regular expression patterns. 
-* Extract URLs with Specific Schemes (e.g., HTTP, HTTPS, FTP): +* Extract URLs with Schemes Pattern: - ```go - extr := hqgourl.NewExtractor( - hqgourl.ExtractorWithSchemePattern(`(?:https?|ftp)://`), - ) - ``` + ```go + extractor := hqgourl.NewExtractor( + hqgourl.ExtractorWithSchemePattern(`(?:https?|ftp)://`), + ) + ``` - This configuration will extract only URLs starting with http, https, or ftp schemes. + This configuration will extract URLs with `http`, `https`, or `ftp` schemes. -* Extract URLs with Custom Host Patterns (e.g., example.com): +* Extract URLs with Host Pattern: - ```go - extr := hqgourl.NewExtractor( - hqgourl.ExtractorWithHostPattern(`(?:www\.)?example\.com`), - ) + ```go + extractor := hqgourl.NewExtractor( + hqgourl.ExtractorWithHostPattern(`(?:www\.)?example\.com`), + ) - ``` + ``` - This setup will extract URLs that have hosts matching www.example.com or example.com. + This configuration will extract URLs that have hosts matching `www.example.com` or `example.com`. 
-> [!NOTE] -> Since API is centered around [regexp.Regexp](https://golang.org/pkg/regexp/#Regexp), many other methods are available +### Parsing -### Domain Parsing +#### Domains The `DomainParser` can parse domains into their components, such as subdomains, root domains, and TLDs: @@ -98,20 +155,20 @@ The `DomainParser` can parse domains into their components, such as subdomains, package main import ( - "fmt" - hqgourl "github.com/hueristiq/hq-go-url" + "fmt" + hqgourl "github.com/hueristiq/hq-go-url" ) func main() { - dp := hqgourl.NewDomainParser() + dp := hqgourl.NewDomainParser() - parsedDomain := dp.Parse("subdomain.example.com") + parsedDomain := dp.Parse("subdomain.example.com") - fmt.Printf("Subdomain: %s, Root Domain: %s, TLD: %s\n", parsedDomain.Sub, parsedDomain.Root, parsedDomain.TopLevel) + fmt.Printf("Subdomain: %s, Root Domain: %s, TLD: %s\n", parsedDomain.Sub, parsedDomain.Root, parsedDomain.TopLevel) } ``` -### URL Parsing +#### URLs The `Parser` provides an extended way to parse URLs, including additional fields like port and file extension: @@ -119,25 +176,25 @@ The `Parser` provides an extended way to parse URLs, including additional fields package main import ( - "fmt" - hqgourl "github.com/hueristiq/hq-go-url" + "fmt" + hqgourl "github.com/hueristiq/hq-go-url" ) func main() { - up := hqgourl.NewParser() + up := hqgourl.NewParser() - parsedURL, err := up.Parse("https://subdomain.example.com:8080/path/file.txt") - if err != nil { - fmt.Println("Error parsing URL:", err) + parsedURL, err := up.Parse("https://subdomain.example.com:8080/path/file.txt") + if err != nil { + fmt.Println("Error parsing URL:", err) - return - } + return + } - fmt.Printf("Subdomain: %s\n", parsedURL.Domain.Sub) - fmt.Printf("Root Domain: %s\n", parsedURL.Domain.Root) - fmt.Printf("TLD: %s\n", parsedURL.Domain.TopLevel) - fmt.Printf("Port: %d\n", parsedURL.Port) - fmt.Printf("File Extension: %s\n", parsedURL.Extension) + fmt.Printf("Subdomain: %s\n", 
parsedURL.Domain.Sub) + fmt.Printf("Root Domain: %s\n", parsedURL.Domain.Root) + fmt.Printf("TLD: %s\n", parsedURL.Domain.TopLevel) + fmt.Printf("Port: %d\n", parsedURL.Port) + fmt.Printf("File Extension: %s\n", parsedURL.Extension) } ``` diff --git a/domain_extractor.go b/domain_extractor.go new file mode 100644 index 0000000..a410f49 --- /dev/null +++ b/domain_extractor.go @@ -0,0 +1,125 @@ +package url + +import ( + "regexp" + "unicode/utf8" + + "github.com/hueristiq/hq-go-url/tlds" +) + +// DomainExtractor is responsible for extracting domain names, including both root domains +// and top-level domains (TLDs), using regular expressions. It provides flexibility in the +// domain extraction process by allowing custom patterns for both root domains and TLDs. +type DomainExtractor struct { + RootDomainPattern string // Custom regex pattern for matching the root domain (e.g., "example"). When non-empty, CompileRegex wraps it as optional dot-separated \w+ labels, this pattern, then a literal dot. + TopLevelDomainPattern string // Custom regex pattern for matching the TLD (e.g., "com"). When non-empty, it replaces the default TLD alternation as-is. +} + +// CompileRegex compiles a regular expression based on the configured DomainExtractor. +// It builds a regex that can match domains, combining the root domain pattern with the top-level domain (TLD) pattern. +// The method separates ASCII and Unicode TLDs and includes a punycode pattern to handle internationalized domain names (IDNs). +// It also ensures that the regex captures the longest possible domain match. It panics if the combined pattern is invalid, because it is compiled with regexp.MustCompile. +// +// Returns: +// - regex: The compiled regular expression for matching domain names. +func (e *DomainExtractor) CompileRegex() (regex *regexp.Regexp) { + // Start from the default _subdomainPattern; a user-specified root pattern replaces it, prefixed with optional dot-separated \w+ labels and followed by a literal dot. + RootDomainPattern := _subdomainPattern + + if e.RootDomainPattern != "" { + RootDomainPattern = `(?:\w+[.])*` + e.RootDomainPattern + `\.` + } + + // Define a pattern for known TLDs, including punycode, ASCII TLDs, and Unicode TLDs. + // Separate ASCII TLDs from Unicode TLDs for the regular expression. The split happens at the first entry whose leading byte is non-ASCII, and the three-index slice caps the ASCII part so the later append of tlds.Pseudo cannot write into tlds.Official. NOTE(review): if tlds.Official holds no such entry both slices stay nil, and an empty entry would panic on tld[0] - confirm the list's ordering and contents. 
+ var asciiTLDs, unicodeTLDs []string + + for i, tld := range tlds.Official { + if tld[0] >= utf8.RuneSelf { + asciiTLDs = tlds.Official[:i:i] + unicodeTLDs = tlds.Official[i:] + + break + } + } + + // Define regular expression components for known TLDs and domains. The ASCII alternatives (official plus tlds.Pseudo) end in \b, and (?i) makes the TLD group case-insensitive. + punycode := `xn--[a-z0-9-]+` + TopLevelDomainPattern := `(?:(?i)` + punycode + `|` + anyOf(append(asciiTLDs, tlds.Pseudo...)...) + `\b|` + anyOf(unicodeTLDs...) + `)` + + if e.TopLevelDomainPattern != "" { + TopLevelDomainPattern = e.TopLevelDomainPattern + } + + // Combine the root domain and TLD patterns to form the complete domain pattern. The bare `localhost` alternative is added only when neither custom pattern is set. + pattern := `(?:` + RootDomainPattern + TopLevelDomainPattern + `)` + + if e.RootDomainPattern == "" && e.TopLevelDomainPattern == "" { + pattern = `(?:` + RootDomainPattern + TopLevelDomainPattern + `|localhost)` + } + + // Compile the regex and set it to find the longest possible match. + regex = regexp.MustCompile(pattern) + + regex.Longest() + + return +} + +// DomainExtractorOptionsFunc defines a function type for configuring a DomainExtractor. +// It allows setting options like custom patterns for root domains and TLDs. +type DomainExtractorOptionsFunc func(*DomainExtractor) + +// DomainExtractorInterface defines the interface for domain extraction functionality. +// It ensures that any domain extractor can compile regular expressions to match domain names. +type DomainExtractorInterface interface { + CompileRegex() (regex *regexp.Regexp) +} + +// Compile-time check that *DomainExtractor implements the DomainExtractorInterface. +var _ DomainExtractorInterface = &DomainExtractor{} + +// NewDomainExtractor creates and initializes a DomainExtractor with optional configurations. +// By default, it uses pre-defined patterns for extracting root domains and TLDs, but custom +// patterns can be applied using the provided options. +// +// Returns: +// - extractor: A pointer to the initialized DomainExtractor. 
+func NewDomainExtractor(opts ...DomainExtractorOptionsFunc) (extractor *DomainExtractor) { + extractor = &DomainExtractor{} + + // Apply the provided options in order; a later option overrides an earlier one that sets the same field. + for _, opt := range opts { + opt(extractor) + } + + return +} + +// DomainExtractorWithRootDomainPattern returns an option function to configure the DomainExtractor +// with a custom regex pattern for matching root domains (e.g., "example" in "example.com"). +// +// Parameters: +// - pattern: The custom root domain regex pattern. An empty string keeps the default; the pattern is not validated here, so an invalid one makes CompileRegex panic. +// +// Returns: +// - A function that applies the custom root domain pattern to the DomainExtractor. +func DomainExtractorWithRootDomainPattern(pattern string) DomainExtractorOptionsFunc { + return func(e *DomainExtractor) { + e.RootDomainPattern = pattern + } +} + +// DomainExtractorWithTLDPattern returns an option function to configure the DomainExtractor +// with a custom regex pattern for matching top-level domains (TLDs) (e.g., "com" in "example.com"). +// +// Parameters: +// - pattern: The custom TLD regex pattern. An empty string keeps the default; the pattern is not validated here, so an invalid one makes CompileRegex panic. +// +// Returns: +// - A function that applies the custom TLD pattern to the DomainExtractor. +func DomainExtractorWithTLDPattern(pattern string) DomainExtractorOptionsFunc { + return func(e *DomainExtractor) { + e.TopLevelDomainPattern = pattern + } +} diff --git a/domain_extractor_test.go b/domain_extractor_test.go new file mode 100644 index 0000000..f71e54b --- /dev/null +++ b/domain_extractor_test.go @@ -0,0 +1,223 @@ +package url_test + +import ( + "testing" + + hqgourl "github.com/hueristiq/hq-go-url" + "github.com/hueristiq/hq-go-url/tlds" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDomainExtractor_CompileRegex_Default(t *testing.T) { + t.Parallel() + + // Initialize DomainExtractor with default settings. + extractor := hqgourl.NewDomainExtractor() + + // Compile the regex. + regex := extractor.CompileRegex() + + // Ensure the regex is not nil. 
+ require.NotNil(t, regex) + + // Test that the regex matches valid domain patterns. MatchString is unanchored, so inputs with a scheme prefix pass when they contain a matching domain. + tests := []struct { + input string + expected bool + }{ + {"example.com", true}, + {"www.example.com", true}, + {"http://www.example.com", true}, + {"localhost", true}, + {"http://localhost", true}, + {"example.localhost", true}, + {"http://example.localhost", true}, + {"xn--example-q9a.com", true}, // IDN with punycode. + {"example.co.uk", true}, + {"invalid_domain", false}, + {"just_text", false}, + {"ftp://example.com", true}, + } + + for _, tt := range tests { + assert.Equalf(t, tt.expected, regex.MatchString(tt.input), "failed on input: %s", tt.input) + } +} + +func TestDomainExtractor_CompileRegex_CustomRootDomainPattern(t *testing.T) { + t.Parallel() + + // Initialize DomainExtractor with a custom root domain pattern. + extractor := hqgourl.NewDomainExtractor( + hqgourl.DomainExtractorWithRootDomainPattern(`(?:example|rootdomain)`), // Custom root domain pattern + ) + + // Compile the regex. + regex := extractor.CompileRegex() + + // Ensure the regex is not nil. + require.NotNil(t, regex) + + // Test cases for custom root domain pattern. localhost is rejected because CompileRegex adds that alternative only when no custom pattern is set. + tests := []struct { + input string + expected bool + }{ + {"rootdomain.com", true}, + {"my-root-domain.org", false}, + {"not_valid_domain", false}, + {"example.com", true}, + {"www.example.com", true}, + {"localhost", false}, + } + + for _, tt := range tests { + assert.Equalf(t, tt.expected, regex.MatchString(tt.input), "failed on input: %s", tt.input) + } +} + +func TestDomainExtractor_CompileRegex_CustomTLDPattern(t *testing.T) { + t.Parallel() + + // Initialize DomainExtractor with a custom TLD pattern. + extractor := hqgourl.NewDomainExtractor( + hqgourl.DomainExtractorWithTLDPattern(`(?:com|net|org)`), // Custom TLD pattern + ) + + // Compile the regex. + regex := extractor.CompileRegex() + + // Ensure the regex is not nil. + require.NotNil(t, regex) + + // Test cases for custom TLD pattern. localhost is rejected because CompileRegex adds that alternative only when no custom pattern is set. 
+ tests := []struct { + input string + expected bool + }{ + {"example.com", true}, + {"example.org", true}, + {"example.net", true}, + {"example.co.uk", false}, + {"localhost", false}, + } + + for _, tt := range tests { + assert.Equalf(t, tt.expected, regex.MatchString(tt.input), "failed on input: %s", tt.input) + } +} + +func TestDomainExtractor_CompileRegex_CustomRootDomainAndTLDPattern(t *testing.T) { + t.Parallel() + + // Initialize DomainExtractor with custom root domain and TLD patterns. + extractor := hqgourl.NewDomainExtractor( + hqgourl.DomainExtractorWithRootDomainPattern(`[a-zA-Z0-9-]+`), + hqgourl.DomainExtractorWithTLDPattern(`(?:com|net)`), + ) + + // Compile the regex. + regex := extractor.CompileRegex() + + // Ensure the regex is not nil. + require.NotNil(t, regex) + + // Test cases for custom root domain and TLD pattern. + tests := []struct { + input string + expected bool + }{ + {"example.com", true}, + {"example.net", true}, + {"example.org", false}, // TLD pattern restricts to com/net. + {"localhost", false}, + {"subdomain.example.com", true}, + } + + for _, tt := range tests { + assert.Equalf(t, tt.expected, regex.MatchString(tt.input), "failed on input: %s", tt.input) + } +} + +func TestDomainExtractor_CompileRegex_TLDSeparation(t *testing.T) { + t.Parallel() + + // Simulate a scenario where the TLDs include both ASCII and Unicode values. NOTE(review): this reassigns the package-level tlds.Official while the test is marked t.Parallel(); the other parallel tests read that variable inside CompileRegex, so this looks like a data race that can also skew their results - consider dropping t.Parallel() here. + originalTLDs := tlds.Official + tlds.Official = []string{"com", "org", "xn--unicode", "测试"} + + // Initialize the DomainExtractor. + extractor := hqgourl.NewDomainExtractor() + + // Compile the regex. + regex := extractor.CompileRegex() + + // Restore the original TLD list. NOTE(review): the cleanup is registered only after CompileRegex has run, so a panic there would leave the override in place; registering it right after the reassignment would be safer. + t.Cleanup(func() { tlds.Official = originalTLDs }) + + // Ensure the regex is not nil. + require.NotNil(t, regex) + + // Test cases for ASCII and Unicode TLDs. + tests := []struct { + input string + expected bool + }{ + {"example.com", true}, + {"example.org", true}, + {"example.测试", true}, // Unicode TLD. 
+ {"example.xn--unicode", true}, // Punycode. + {"example.co.uk", false}, // TLD not in the list. + {"localhost", true}, + } + + for _, tt := range tests { + assert.Equalf(t, tt.expected, regex.MatchString(tt.input), "failed on input: %s", tt.input) + } +} + +func TestDomainExtractor_CustomPatterns_Failures(t *testing.T) { + t.Parallel() + + // Test with invalid root domain and TLD patterns. + extractor := hqgourl.NewDomainExtractor( + hqgourl.DomainExtractorWithRootDomainPattern(`[invalid`), // Invalid regex pattern + hqgourl.DomainExtractorWithTLDPattern(`(`), // Invalid regex pattern + ) + + // Expecting a panic due to invalid regex patterns: CompileRegex builds the expression with regexp.MustCompile, which panics on a malformed pattern. + assert.Panics(t, func() { + extractor.CompileRegex() + }, "Expected panic with invalid regex patterns") +} + +func TestDomainExtractor_CustomPatterns_Empty(t *testing.T) { + t.Parallel() + + // Test with empty custom root domain and TLD patterns. + extractor := hqgourl.NewDomainExtractor( + hqgourl.DomainExtractorWithRootDomainPattern(""), + hqgourl.DomainExtractorWithTLDPattern(""), + ) + + // Compile the regex. + regex := extractor.CompileRegex() + + // Ensure the regex is not nil. + require.NotNil(t, regex) + + // The regex should fall back to default behavior, because CompileRegex treats an empty pattern string as unset. 
+ tests := []struct { + input string + expected bool + }{ + {"example.com", true}, + {"localhost", true}, + {"invalid_domain", false}, + } + + for _, tt := range tests { + assert.Equalf(t, tt.expected, regex.MatchString(tt.input), "failed on input: %s", tt.input) + } +} diff --git a/go.mod b/go.mod index 9f7e329..a81cb21 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,11 @@ module github.com/hueristiq/hq-go-url go 1.23.1 + +require github.com/stretchr/testify v1.9.0 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum index e69de29..60ce688 100644 --- a/go.sum +++ b/go.sum @@ -0,0 +1,10 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=