Skip to content

Commit

Permalink
import_feed: Handle shifted base RSS feeds and redirects
Browse files Browse the repository at this point in the history
* Actually consider generated redirects
* Do not squash post sub-folder structure (like date folders)
* Only generate redirects if URLs change
* Support having blog posts in a differently named top folder

Issue: getnikola#389
  • Loading branch information
aigarius committed Jun 27, 2021
1 parent 2b68522 commit 9a8e44a
Showing 1 changed file with 50 additions and 31 deletions.
81 changes: 50 additions & 31 deletions v7/import_feed/import_feed.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
import time

try:
from urlparse import urlparse
from urlparse import urlparse, urlunparse
except ImportError:
from urllib.parse import urlparse # NOQA
from urllib.parse import urlparse, urlunparse # NOQA

try:
import feedparser
Expand Down Expand Up @@ -60,9 +60,16 @@ class CommandImportFeed(Command, ImportMixin):
'name': 'output_folder',
'long': 'output-folder',
'short': 'o',
'default': 'new_site',
'default': '../new_site',
'help': 'Location to write imported content.'
},
{
'name': 'base_name',
'long': 'base-name',
'short': 'b',
'default': 'posts',
'help': 'Top folder of the blog posts URL'
},
{
'name': 'url',
'long': 'url',
Expand All @@ -85,17 +92,19 @@ def _execute(self, options, args):
return

self.feed_url = options['url']
self.base_name = options['base_name']
self.output_folder = options['output_folder']
self.import_into_existing_site = False
self.url_map = {}
channel = self.get_channel_from_file(self.feed_url)
self.context = self.populate_context(channel)
self.context = self.populate_context(channel, self.base_name)
conf_template = self.generate_base_site()
self.context['REDIRECTIONS'] = self.configure_redirections(
self.url_map)

self.import_posts(channel)

self.context['REDIRECTIONS'] = self.configure_redirections(
self.url_map)

self.write_configuration(self.get_configuration_output_path(
), conf_template.render(**prepare_config(self.context)))

Expand All @@ -104,20 +113,23 @@ def get_channel_from_file(cls, filename):
return feedparser.parse(filename)

@staticmethod
def populate_context(channel):
def populate_context(channel, base_name):
context = SAMPLE_CONF.copy()
context['DEFAULT_LANG'] = channel.feed.title_detail.language \
if channel.feed.title_detail.language else 'en'
context['BLOG_TITLE'] = channel.feed.title

context['BLOG_DESCRIPTION'] = channel.feed.get('subtitle', '')
context['SITE_URL'] = channel.feed.get('link', '').rstrip('/')
site_url = urlparse(channel.feed.get('link', ''))
site_url = site_url._replace(path="/", params="", query="", fragment="")
context['SITE_URL'] = urlunparse(site_url)
context['BASE_URL'] = channel.feed.get('link', '')
context['BLOG_EMAIL'] = channel.feed.author_detail.get('email', '') if 'author_detail' in channel.feed else ''
context['BLOG_AUTHOR'] = channel.feed.author_detail.get('name', '') if 'author_detail' in channel.feed else ''

context['POSTS'] = '''(
("posts/*.html", "posts", "post.tmpl"),
)'''
("{0}/*.html", "{0}", "post.tmpl"),
)'''.format(base_name)
context['PAGES'] = '''(
("stories/*.html", "stories", "story.tmpl"),
)'''
Expand All @@ -135,17 +147,22 @@ def import_posts(self, channel):
self.process_item(item)

def process_item(self, item):
self.import_item(item, 'posts')
self.import_item(item, self.base_name)

def import_item(self, item, out_folder=None):
def import_item(self, item, out_folder):
"""Takes an item from the feed and creates a post file."""
if out_folder is None:
out_folder = 'posts'

# link is something like http://foo.com/2012/09/01/hello-world/
# So, take the path, utils.slugify it, and that's our slug
# So, let's remove the BASE_URL from it to get the real path
link = item.link
link_path = urlparse(link).path

# TODO - link may be without domain

if not link.startswith(self.context["BASE_URL"]):
LOGGER.error("Foreign URL found in feed: %s", link)
return

link_path = link[len(self.context["BASE_URL"]):]

title = item.title

Expand All @@ -155,14 +172,19 @@ def import_item(self, item, out_folder=None):
"as placeholder, please fix.".format(link))
title = "NO_TITLE"

if link_path.lower().endswith('.html'):
link_path = link_path[:-5]

slug = utils.slugify(link_path)
file_path = os.path.join(*[utils.slugify(x) for x in link_path.split("/")])
file_path = os.path.join(out_folder, file_path)

if not slug: # should never happen
LOGGER.error("Error converting post:", title)
return
if "/" + file_path != urlparse(link).path:
LOGGER.info("URL moved from %s to %s", urlparse(link).path, file_path)
do_link = True
else:
do_link = False

if file_path.endswith("/"):
file_path += "index.html"
slug = utils.slugify(link_path)

description = ''
try:
Expand Down Expand Up @@ -192,21 +214,18 @@ def import_item(self, item, out_folder=None):
else:
is_draft = False

self.url_map[link] = self.context['SITE_URL'] + '/' + \
out_folder + '/' + slug + '.html'

if is_draft and self.exclude_drafts:
LOGGER.notice('Draft "{0}" will not be imported.'.format(title))
elif content.strip():
# If no content is found, no files are written.
content = self.transform_content(content)

self.write_metadata(os.path.join(self.output_folder, out_folder,
slug + '.meta'),
title, slug, post_date, description, tags)
self.write_content(
os.path.join(self.output_folder, out_folder, slug + '.html'),
content)
os.makedirs(os.path.dirname(file_path), exist_ok=True)
self.write_metadata(os.path.join(self.output_folder, file_path[:-len(".html")] + '.meta'),
title, "", post_date, description, tags)
self.write_content(os.path.join(self.output_folder, file_path), content)
if do_link:
self.url_map[urlparse(link).path] = "/" + file_path
else:
LOGGER.warn('Not going to import "{0}" because it seems to contain'
' no content.'.format(title))
Expand Down

0 comments on commit 9a8e44a

Please sign in to comment.