From 64de62e8d7b1ec8ccc72ded5dee2d34eff96939d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 6 Jan 2025 17:18:50 +0100 Subject: [PATCH 1/4] docs: more details about different `ProxyConfiguration` options --- docs/guides/proxy_management.mdx | 63 +++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/docs/guides/proxy_management.mdx b/docs/guides/proxy_management.mdx index 50ef9b949713..0327314fcd80 100644 --- a/docs/guides/proxy_management.mdx +++ b/docs/guides/proxy_management.mdx @@ -61,7 +61,66 @@ Examples of how to use our proxy URLs with crawlers are shown below in [Crawler All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` `constructor` function based on the provided options. See the `ProxyConfigurationOptions` for all the possible constructor options. -### Crawler integration +### Static proxy list + +We can provide a list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies. + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: [ + 'http://proxy-1.com', + 'http://proxy-2.com', + null // null means no proxy is used + ] +}); +``` + +This is a simple way to use a list of proxies. Crawlee will rotate through the list of proxies in a round-robin fashion. + +### Custom proxy function + +The `ProxyConfiguration` class allows us to provide a custom function to pick a proxy URL. This is useful when we want to implement our own logic for selecting a proxy. 
+ +```javascript +const proxyConfiguration = new ProxyConfiguration({ + newUrlFunction: (sessionId, { request }) => { + if (request?.url.includes('crawlee.dev')) { + return null; // for crawlee.dev, we don't use a proxy + } + + return 'http://proxy-1.com'; // for all other URLs, we use this proxy + } +}); +``` + +The `newUrlFunction` receives two parameters - `sessionId` and `options` - and returns a string containing the proxy URL. + +The `sessionId` parameter is always provided and allows us to differentiate between different sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it will automatically create a new session with a different id. + +The `options` parameter is an object containing the `request` object, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. +Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. + +### Tiered proxies + +We can also provide a list of proxy tiers to the `ProxyConfiguration` class. This is useful when we want switch between different proxies automatically based on the blocking behavior of the website. + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + tieredProxyUrls: [ + [null], + ['http://okay-proxy.com'], + ['http://slightly-better-proxy.com', 'http://slightly-better-proxy-2.com'], + ['http://very-good-and-expensive-proxy.com'], + ] +}); +``` + +This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognized we're getting blocked by the target website. +If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL. + +Crawlee also periodically probes lower tier proxies to see if they are unblocked, and if they are, it will switch back to them. 
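The tier-escalation behavior this patch describes (start cheap, move up a tier on blocking, periodically probe back down) can be sketched in plain JavaScript. This is a simplified illustration under stated assumptions, not Crawlee's actual implementation — the class name, the `reportBlocked`/`probeLowerTier` hooks, and the reset-on-tier-change choice are all hypothetical; only the tier list shape and the up/down movement come from the text above:

```javascript
// Simplified sketch of tier escalation. Crawlee's real logic also tracks
// per-domain state and decides *when* to probe; here the caller drives it.
class TieredProxySketch {
    constructor(tiers) {
        this.tiers = tiers; // e.g. [[null], ['http://okay-proxy.com'], ...]
        this.tier = 0;      // start at the cheapest tier (often "no proxy")
        this.counter = 0;   // round-robin index within the active tier
    }

    // Pick the next URL from the active tier, rotating round-robin.
    nextUrl() {
        const tier = this.tiers[this.tier];
        return tier[this.counter++ % tier.length];
    }

    // Called when the target site blocks us: escalate one tier, if possible.
    reportBlocked() {
        if (this.tier < this.tiers.length - 1) {
            this.tier++;
            this.counter = 0;
        }
    }

    // Called occasionally on success: probe one tier down to save cost.
    probeLowerTier() {
        if (this.tier > 0) {
            this.tier--;
            this.counter = 0;
        }
    }
}
```

With the tier list from the example above, `nextUrl()` first returns `null` (no proxy); after a `reportBlocked()` call it starts handing out `okay-proxy` URLs, and so on up the list.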
+ +## Crawler integration `ProxyConfiguration` integrates seamlessly into `HttpCrawler`, `CheerioCrawler`, `JSDOMCrawler`, `PlaywrightCrawler` and `PuppeteerCrawler`. @@ -95,7 +154,7 @@ All our proxy needs are managed by the `proxyConfiguration.newUrl()` allows us to pass a `sessionId` parameter. It will then be used to create a `sessionId`-`proxyUrl` pair, and subsequent `newUrl()` calls with the same `sessionId` will always return the same `proxyUrl`. This is extremely useful in scraping, because we want to create the impression of a real user. See the [session management guide](../guides/session-management) and `SessionPool` class for more information on how keeping a real session helps us avoid blocking. From 667d96055763c45d0949218b73e9696d83016e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Tue, 7 Jan 2025 10:49:45 +0100 Subject: [PATCH 2/4] docs: small refactor --- docs/guides/proxy_management.mdx | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/guides/proxy_management.mdx b/docs/guides/proxy_management.mdx index 0327314fcd80..fdf1a41182d7 100644 --- a/docs/guides/proxy_management.mdx +++ b/docs/guides/proxy_management.mdx @@ -59,11 +59,13 @@ Examples of how to use our proxy URLs with crawlers are shown below in [Crawler ## Proxy Configuration -All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` `constructor` function based on the provided options. See the `ProxyConfigurationOptions` for all the possible constructor options. +All our proxy needs are managed by the `ProxyConfiguration` class. +We create an instance using the `ProxyConfiguration` `constructor` function based on the provided options. +See the `ProxyConfigurationOptions` for all the possible constructor options. ### Static proxy list -We can provide a list of proxy URLs to the `proxyUrls` option. 
The `ProxyConfiguration` will then rotate through the provided proxies. +We can provide a static list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies. ```javascript const proxyConfiguration = new ProxyConfiguration({ @@ -75,7 +77,7 @@ const proxyConfiguration = new ProxyConfiguration({ }); ``` -This is a simple way to use a list of proxies. Crawlee will rotate through the list of proxies in a round-robin fashion. +This is the simplest way to use a list of proxies. Crawlee will rotate through the list of proxies in a round-robin fashion. ### Custom proxy function @@ -97,17 +99,25 @@ The `newUrlFunction` receives two parameters - `sessionId` and `options` - and r The `sessionId` parameter is always provided and allows us to differentiate between different sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it will automatically create a new session with a different id. -The `options` parameter is an object containing the `request` object, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. +The `options` parameter is an object containing a `Request`, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. ### Tiered proxies We can also provide a list of proxy tiers to the `ProxyConfiguration` class. This is useful when we want switch between different proxies automatically based on the blocking behavior of the website. +:::warning + +Note that the `tieredProxyUrls` option requires `ProxyConfiguration` to be used from a crawler instance ([see below](#crawler-integration)). + +Using this configuration through the `newUrl` calls will not yield the expected results. 
+ +::: + ```javascript const proxyConfiguration = new ProxyConfiguration({ tieredProxyUrls: [ - [null], + [null], // At first, we try to connect without a proxy ['http://okay-proxy.com'], ['http://slightly-better-proxy.com', 'http://slightly-better-proxy-2.com'], ['http://very-good-and-expensive-proxy.com'], From f9cadbff80996f418c40f19bde53ff5ebf7ec6f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Fri, 17 Jan 2025 14:56:59 +0100 Subject: [PATCH 3/4] chore: apply PR review suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Martin Adámek --- docs/guides/proxy_management.mdx | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/docs/guides/proxy_management.mdx b/docs/guides/proxy_management.mdx index fdf1a41182d7..8bf385f1c5b5 100644 --- a/docs/guides/proxy_management.mdx +++ b/docs/guides/proxy_management.mdx @@ -59,13 +59,11 @@ Examples of how to use our proxy URLs with crawlers are shown below in [Crawler ## Proxy Configuration -All our proxy needs are managed by the `ProxyConfiguration` class. -We create an instance using the `ProxyConfiguration` `constructor` function based on the provided options. -See the `ProxyConfigurationOptions` for all the possible constructor options. +All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` `constructor` function based on the provided options. See the `ProxyConfigurationOptions` for all the possible constructor options. ### Static proxy list -We can provide a static list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies. +You can provide a static list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies. 
```javascript const proxyConfiguration = new ProxyConfiguration({ @@ -81,7 +79,7 @@ This is the simplest way to use a list of proxies. Crawlee will rotate through t ### Custom proxy function -The `ProxyConfiguration` class allows us to provide a custom function to pick a proxy URL. This is useful when we want to implement our own logic for selecting a proxy. +The `ProxyConfiguration` class allows you to provide a custom function to pick a proxy URL. This is useful when you want to implement your own logic for selecting a proxy. ```javascript const proxyConfiguration = new ProxyConfiguration({ @@ -99,12 +97,11 @@ The `newUrlFunction` receives two parameters - `sessionId` and `options` - and r The `sessionId` parameter is always provided and allows us to differentiate between different sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it will automatically create a new session with a different id. -The `options` parameter is an object containing a `Request`, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. -Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. +The `options` parameter is an object containing a `Request`, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. ### Tiered proxies -We can also provide a list of proxy tiers to the `ProxyConfiguration` class. This is useful when we want switch between different proxies automatically based on the blocking behavior of the website. +You can also provide a list of proxy tiers to the `ProxyConfiguration` class. 
This is useful when you want to switch between different proxies automatically based on the blocking behavior of the website. :::warning @@ -125,8 +122,7 @@ const proxyConfiguration = new ProxyConfiguration({ }); ``` -This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognized we're getting blocked by the target website. -If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL. +This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognizes we're getting blocked by the target website. If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL. Crawlee also periodically probes lower tier proxies to see if they are unblocked, and if they are, it will switch back to them. From 49b78e6eaf82a8800afeaeb7a89e551ee993ac0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Fri, 17 Jan 2025 14:01:08 +0000 Subject: [PATCH 4/4] docs: update the `ProxyConfiguration` guide in `version-3.12` --- .../version-3.12/guides/proxy_management.mdx | 69 ++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/website/versioned_docs/version-3.12/guides/proxy_management.mdx b/website/versioned_docs/version-3.12/guides/proxy_management.mdx index 50ef9b949713..8bf385f1c5b5 100644 --- a/website/versioned_docs/version-3.12/guides/proxy_management.mdx +++ b/website/versioned_docs/version-3.12/guides/proxy_management.mdx @@ -61,7 +61,72 @@ Examples of how to use our proxy URLs with crawlers are shown below in [Crawler All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` `constructor` function based on the provided options. 
See the `ProxyConfigurationOptions` for all the possible constructor options. -### Crawler integration +### Static proxy list + +You can provide a static list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies. + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: [ + 'http://proxy-1.com', + 'http://proxy-2.com', + null // null means no proxy is used + ] +}); +``` + +This is the simplest way to use a list of proxies. Crawlee will rotate through the list of proxies in a round-robin fashion. + +### Custom proxy function + +The `ProxyConfiguration` class allows you to provide a custom function to pick a proxy URL. This is useful when you want to implement your own logic for selecting a proxy. + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + newUrlFunction: (sessionId, { request }) => { + if (request?.url.includes('crawlee.dev')) { + return null; // for crawlee.dev, we don't use a proxy + } + + return 'http://proxy-1.com'; // for all other URLs, we use this proxy + } +}); +``` + +The `newUrlFunction` receives two parameters - `sessionId` and `options` - and returns a string containing the proxy URL. + +The `sessionId` parameter is always provided and allows us to differentiate between different sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it will automatically create a new session with a different id. + +The `options` parameter is an object containing a `Request`, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. + +### Tiered proxies + +You can also provide a list of proxy tiers to the `ProxyConfiguration` class. 
This is useful when you want to switch between different proxies automatically based on the blocking behavior of the website. + +:::warning + +Note that the `tieredProxyUrls` option requires `ProxyConfiguration` to be used from a crawler instance ([see below](#crawler-integration)). + +Using this configuration through the `newUrl` calls will not yield the expected results. + +::: + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + tieredProxyUrls: [ + [null], // At first, we try to connect without a proxy + ['http://okay-proxy.com'], + ['http://slightly-better-proxy.com', 'http://slightly-better-proxy-2.com'], + ['http://very-good-and-expensive-proxy.com'], + ] +}); +``` + +This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognizes we're getting blocked by the target website. If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL. + +Crawlee also periodically probes lower tier proxies to see if they are unblocked, and if they are, it will switch back to them. + +## Crawler integration `ProxyConfiguration` integrates seamlessly into `HttpCrawler`, `CheerioCrawler`, `JSDOMCrawler`, `PlaywrightCrawler` and `PuppeteerCrawler`. @@ -95,7 +160,7 @@ All our proxy needs are managed by the `proxyConfiguration.newUrl()` allows us to pass a `sessionId` parameter. It will then be used to create a `sessionId`-`proxyUrl` pair, and subsequent `newUrl()` calls with the same `sessionId` will always return the same `proxyUrl`. This is extremely useful in scraping, because we want to create the impression of a real user. See the [session management guide](../guides/session-management) and `SessionPool` class for more information on how keeping a real session helps us avoid blocking.
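The `sessionId`-`proxyUrl` pairing that the guide describes for `newUrl()` can be sketched as follows. This is an illustrative reimplementation of the documented behavior, not Crawlee's source — the class name and the `Map`-based cache are assumptions; the contract it demonstrates (same `sessionId` always yields the same proxy, new sessions rotate through the list) is taken from the text:

```javascript
// Sketch of session-sticky proxy selection: a sessionId is paired with a
// proxy URL on first use, and every later call with that sessionId
// returns the same URL, imitating a single consistent user.
class SessionProxySketch {
    constructor(proxyUrls) {
        this.proxyUrls = proxyUrls;
        this.nextIndex = 0;
        this.bySession = new Map(); // sessionId -> proxyUrl (assumed cache)
    }

    newUrl(sessionId) {
        // Without a sessionId, just rotate round-robin.
        if (sessionId === undefined) {
            return this.proxyUrls[this.nextIndex++ % this.proxyUrls.length];
        }
        // With a sessionId, pair it with a proxy once and then reuse it.
        if (!this.bySession.has(sessionId)) {
            const url = this.proxyUrls[this.nextIndex++ % this.proxyUrls.length];
            this.bySession.set(sessionId, url);
        }
        return this.bySession.get(sessionId);
    }
}
```

Repeated calls with `'session-a'` keep returning the proxy that session was first paired with, while a fresh `'session-b'` gets the next proxy in the rotation — the stickiness that helps a crawler look like one real user per session.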