politico.py
import json

import feedparser
from urllib.parse import urljoin, urlparse

# Load existing URLs from the JSON file
# (expected to contain a flat JSON array of URL strings)
with open("politico_urls.json", "r") as politico_file:
    politico_json = json.load(politico_file)

# List of Politico RSS feeds to poll
feeds = [
    "https://rss.politico.com/congress.xml",
    "https://rss.politico.com/healthcare.xml",
    "https://rss.politico.com/defense.xml",
    "https://rss.politico.com/economy.xml",
    "https://rss.politico.com/energy.xml",
    "https://rss.politico.com/politics-news.xml",
]

# Parse each feed and extract article URLs
for url in feeds:
    try:
        feed = feedparser.parse(url)

        # feedparser sets `bozo` when the feed is malformed or could not be fetched
        if feed.bozo:
            print(f"Warning: Failed to parse feed {url}")
            continue

        # Process each article in the feed
        for article in feed.entries:
            if 'link' in article:
                raw_link = article['link']
                # Keep scheme, host, and path; drop query strings and fragments
                clean_url = urljoin(raw_link, urlparse(raw_link).path)
                # Add the URL only if it's not already present
                if clean_url not in politico_json:
                    politico_json.append(clean_url)
            else:
                print(f"Warning: Missing 'link' key in article from feed {url}")
    except Exception as e:
        print(f"Error processing feed {url}: {e}")

# Save the updated URL list back to the JSON file
with open("politico_urls.json", "w") as f:
    json.dump(politico_json, f, indent=4)
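
# Note: the URL-cleaning step above keeps the scheme, host, and path of each
# article link while dropping tracking query strings and fragments. A minimal
# sketch of that behavior (the example URL is illustrative, not from a real feed):
#
#     from urllib.parse import urljoin, urlparse
#
#     raw_link = "https://www.politico.com/news/example-story?utm_source=rss#comments"
#     print(urljoin(raw_link, urlparse(raw_link).path))
#     # -> https://www.politico.com/news/example-story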