Add positive URL_RULES #33

Merged · 1 commit · Dec 2, 2024

2 changes: 2 additions & 0 deletions README.md
@@ -17,5 +17,7 @@ Configuration is handled through environment variables as listed below:
- Example: `HEADERS=Rate-Limit-Token:ABC123,X-Header:X-Value`
- CONCURRENCY: Controls the number of concurrent requests, useful for controlling request rate.
- Example: `CONCURRENCY=10`
- URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should crawl. All other URLs will be avoided.
- Example: `URL_RULES=https://www.gov.uk/.*`
- DISALLOWED_URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should avoid.
- Example: `DISALLOWED_URL_RULES=/search/.*,/government/.*\.atom`
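
Taken together, the two variables define an allow-then-deny check: a URL must match the positive rules and must not match any disallowed rule. The sketch below illustrates that behaviour with the example values above; `compileRules` and `allowed` are illustrative helpers written for this note (not part of the repo) that mirror the `isRequestAllowed` change to `internal/client/client.go` further down.

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// compileRules splits a comma-separated env value into compiled
// patterns, mirroring `envSeparator:","` in internal/config/config.go.
func compileRules(env string) []*regexp.Regexp {
	var rules []*regexp.Regexp
	for _, p := range strings.Split(env, ",") {
		rules = append(rules, regexp.MustCompile(p))
	}
	return rules
}

// allowed mirrors the check this PR adds to isRequestAllowed:
// every positive rule must match, and no disallowed rule may match.
func allowed(rawURL string, urlRules, disallowedRules []*regexp.Regexp) bool {
	u := []byte(rawURL)
	for _, r := range urlRules {
		if !r.Match(u) {
			return false
		}
	}
	for _, r := range disallowedRules {
		if r.Match(u) {
			return false
		}
	}
	return true
}

func main() {
	urlRules := compileRules(`https://www.gov.uk/.*`)
	disallowed := compileRules(`/search/.*,/government/.*\.atom`)

	fmt.Println(allowed("https://www.gov.uk/browse", urlRules, disallowed))     // true
	fmt.Println(allowed("https://www.gov.uk/search/all", urlRules, disallowed)) // false: matches a disallowed rule
	fmt.Println(allowed("https://example.com/page", urlRules, disallowed))      // false: fails the positive rule
}
```
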
6 changes: 6 additions & 0 deletions internal/client/client.go
@@ -42,6 +42,12 @@ func NewClient(c *colly.Collector, redirectHandler func(*http.Request, []*http.R
func isRequestAllowed(c *colly.Collector, parsedURL *url.URL) bool {
u := []byte(parsedURL.String())

for _, r := range c.URLFilters {
if !r.Match(u) {
return false
}
}

for _, r := range c.DisallowedURLFilters {
if r.Match(u) {
return false
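
Note on semantics: the new loop requires the URL to match every pattern in `URLFilters` before it is allowed (logical AND). With a single pattern, as in the README example, this is equivalent to any-match, but with several comma-separated rules only URLs matching all of them get through; colly's own collector-level filter check, by contrast, passes a URL when any one filter matches, so the two checks can diverge once multiple positive rules are configured.
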
8 changes: 8 additions & 0 deletions internal/client/client_test.go
@@ -44,6 +44,7 @@ func TestNewClient(t *testing.T) {
func TestIsRequestAllowedTableDriven(t *testing.T) {
tests := []struct {
name string
allowedURLs []*regexp.Regexp
disallowedURLs []*regexp.Regexp
allowedDomains []string
url string
@@ -60,6 +61,12 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
url: "http://example.com",
expectedAllowed: false,
},
{
name: "URL filter",
allowedURLs: []*regexp.Regexp{regexp.MustCompile("https://www.gov.uk")},
url: "http://example.com",
expectedAllowed: false,
},
{
name: "allowed domain",
allowedDomains: []string{"example.com"},
@@ -78,6 +85,7 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
c := colly.NewCollector()
c.DisallowedURLFilters = tt.disallowedURLs
c.URLFilters = tt.allowedURLs
c.AllowedDomains = tt.allowedDomains
parsedURL, _ := url.Parse(tt.url)
assert.Equal(t, tt.expectedAllowed, isRequestAllowed(c, parsedURL))
1 change: 1 addition & 0 deletions internal/config/config.go
@@ -13,6 +13,7 @@ type Config struct {
UserAgent string `env:"USER_AGENT" envDefault:"govukbot"`
Headers map[string]string `env:"HEADERS"`
Concurrency int `env:"CONCURRENCY" envDefault:"10"`
URLFilters []*regexp.Regexp `env:"URL_RULES" envSeparator:","`
DisallowedURLFilters []*regexp.Regexp `env:"DISALLOWED_URL_RULES" envSeparator:","`
}

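
One caveat with `envSeparator:","`: a pattern cannot itself contain a literal comma, so a regex using bounded repetition such as `a{1,3}` would be split into two broken rules at parse time. The README's example patterns avoid this, but it limits what `URL_RULES` and `DISALLOWED_URL_RULES` can express.
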
9 changes: 7 additions & 2 deletions internal/config/config_test.go
@@ -29,7 +29,8 @@ func TestNewConfig(t *testing.T) {
"USER_AGENT": "custom-agent",
"HEADERS": "Test-Header:Test-Value",
"CONCURRENCY": "20",
"DISALLOWED_URL_RULES": "rule1,rule2",
"URL_RULES": "rule1,rule2",
"DISALLOWED_URL_RULES": "rule3,rule4",
},
expected: &Config{
Site: "example.com",
@@ -39,10 +40,14 @@
"Test-Header": "Test-Value",
},
Concurrency: 20,
DisallowedURLFilters: []*regexp.Regexp{
URLFilters: []*regexp.Regexp{
regexp.MustCompile("rule1"),
regexp.MustCompile("rule2"),
},
DisallowedURLFilters: []*regexp.Regexp{
regexp.MustCompile("rule3"),
regexp.MustCompile("rule4"),
},
},
},
}
1 change: 1 addition & 0 deletions internal/crawler/crawler.go
@@ -31,6 +31,7 @@ func newCollector(cfg *config.Config) (*colly.Collector, error) {
c := colly.NewCollector(
colly.UserAgent(cfg.UserAgent),
colly.AllowedDomains(cfg.AllowedDomains...),
colly.URLFilters(cfg.URLFilters...),
colly.DisallowedURLFilters(cfg.DisallowedURLFilters...),
colly.Async(true),
)
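
Wiring the compiled rules in through `colly.URLFilters` means the collector itself also vets every request it schedules against the positive rules, complementing the client-level `isRequestAllowed` check added earlier in this PR.
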
7 changes: 7 additions & 0 deletions internal/crawler/crawler_test.go
@@ -184,6 +184,9 @@ func TestNewCrawler(t *testing.T) {
cfg := &config.Config{
UserAgent: "custom-agent",
AllowedDomains: []string{"example.com"},
URLFilters: []*regexp.Regexp{
regexp.MustCompile(".*"),
},
DisallowedURLFilters: []*regexp.Regexp{
regexp.MustCompile(".*disallowed.*"),
},
@@ -196,6 +199,7 @@
assert.IsType(t, &colly.Collector{}, cr.collector)
assert.Equal(t, "custom-agent", cr.collector.UserAgent)
assert.Equal(t, []string{"example.com"}, cr.collector.AllowedDomains)
assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*")}, cr.collector.URLFilters)
assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*disallowed.*")}, cr.collector.DisallowedURLFilters)
assert.Equal(t, true, cr.collector.Async)
}
@@ -284,6 +288,9 @@ func TestRun(t *testing.T) {
cfg := &config.Config{
Site: ts.URL + "/sitemap.xml",
AllowedDomains: []string{hostname},
URLFilters: []*regexp.Regexp{
regexp.MustCompile(".*"),
},
DisallowedURLFilters: []*regexp.Regexp{
regexp.MustCompile("/disallowed"),
},