Commit

Add files via upload
anezih authored Mar 2, 2023
0 parents commit 692e787
Showing 2 changed files with 375 additions and 0 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -0,0 +1,14 @@
This plugin tries to parse metadata and cover information from kitapyurdu.com, a Turkish online bookstore.

**Main Features:**
- The plugin can retrieve: title, author, rating, tags, ISBN, publication date, publisher, language, and comments (a short description of the book).
- The plugin sets a **kitapyurdu:** identifier for easy access from the details tab, where it shows up as **Kitapyurdu**. Additionally, if you choose to download the cover from this source, it sets a **kitapyurdu_kapak** identifier to cache the cover URL. The latter is not shown in the details tab.
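- For example, an entry whose **kitapyurdu:** identifier is `<id>` resolves to `https://www.kitapyurdu.com/kitap/-/<id>.html`, while **kitapyurdu_kapak** stores the `fn:` image id used to rebuild the cover URL (`https://img.kitapyurdu.com/v1/getImage/fn:<id>`).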

**Configuration:**
- You can configure the maximum number of search results to parse in the plugin settings. Choices are: 20, 25, 50. Default: 20.

<details>
<summary><b>Changelog:</b></summary>
<b>Ver. 1.0.0</b>
<ul>
<li>Initial release</li>
</ul>
</details>
361 changes: 361 additions & 0 deletions __init__.py
@@ -0,0 +1,361 @@
import re
from datetime import datetime
from queue import Empty, Queue
from urllib.parse import quote_plus

import mechanize
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import Option, Source
from calibre.utils.icu import lower, normalize, remove_accents_icu, title_case

# ENTRIES_PER_SEARCH_RESULT_PAGE = 20

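# Map Turkish language names shown on kitapyurdu.com product pages to English names for calibre's language field.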
lang_to_eng = {
"Türkçe" : "Turkish",
"İngilizce" : "English",
"İspanyolca" : "Spanish",
"İtalyanca" : "Italian",
"Korece" : "Korean",
"Rusça" : "Russian",
"Almanca" : "German",
"Fransızca" : "French"
}

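# Plain container for everything scraped from a single kitapyurdu.com product page.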
class KitapyurduMetadata():
title: str = None
author: str = None
editor: str = None
translator: str = None
publisher: str = None
rating: int = None
cover_url: list = None
desc: str = None
date: datetime = None
original_name: str = None
isbn: str = None
lang: str = None
pages_num: int = None
tags: set[str] = None
url: str = None
_id: str = None
cover_id: str = None
source_relevance: int = None

def to_calibre_metadata(self):
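        """Convert the scraped fields into a calibre Metadata object, setting only the fields that were found."""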
mi = Metadata(
title=self.title,
authors=self.author
)
mi.identifiers = {}
mi.identifiers["kitapyurdu"] = self._id
        if self.cover_id and self.cover_id != "0":
mi.identifiers["kitapyurdu_kapak"] = self.cover_id
if self.isbn:
mi.isbn = self.isbn
if self.publisher:
mi.publisher = self.publisher
        if self.rating is not None and self.rating >= 0:
mi.rating = self.rating
if self.lang:
eng = lang_to_eng.get(self.lang)
if eng:
mi.language = eng
if self.tags:
mi.tags = list(self.tags)
if self.date:
mi.pubdate = self.date
if self.desc:
mi.comments = self.desc
mi.source_relevance = self.source_relevance
return mi

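# Runs a kitapyurdu.com search for the given query and scrapes each result page into a KitapyurduMetadata object.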
class KitapyurduMetadataParser():
def __init__(self, query, limit, logger) -> None:
self.query = query
self.max_results = limit
self.logger = logger
self.br = mechanize.Browser()

def url_content_from_query(self, query, limit):
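        # The site's search endpoint takes the query string in filter_name and the page size in limit.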
quoted = quote_plus(query)
_url = f"https://www.kitapyurdu.com/index.php?route=product/search&filter_name={quoted}&limit={limit}"
try:
r = self.br.open(_url)
content = r.read()
r.close()
return content
except Exception as e:
self.logger.exception(f"Failed to get search results, exception: {e}\nURL was: {_url}")
return None

def search_urls(self, soup):
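        # Collect the product page links from the #product-table element of the search results page.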
table = soup.select_one("#product-table")
if table:
products = table.select("div.product-cr")
            links = [p.select_one("div.name > a") for p in products]
            return [link["href"] for link in links if link]
else:
return []

def url_content(self, u):
try:
r = self.br.open(u)
content = r.read()
r.close()
return content
except Exception as e:
self.logger.exception(f"Failed to get page content, exception: {e}\nURL was: {u}")
return None

def get_search_page_urls(self, q, lim=20):
content = self.url_content_from_query(query=q, limit=lim)
if content:
soup = BeautifulSoup(content, "lxml")
urls = self.search_urls(soup)
if urls:
return urls
else:
return []
else:
return []

def parse_pages(self):
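        # Fetch every product page found by the search and scrape it into a KitapyurduMetadata instance.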
        page_contents = [(self.url_content(u), u) for u in self.get_search_page_urls(q=self.query, lim=self.max_results)]
        soups = [(BeautifulSoup(content, "lxml"), u) for content, u in page_contents if content]
metadata_list = []
if not soups:
return metadata_list
for soup in soups:
metadata = KitapyurduMetadata()
metadata.url = soup[1]

            id_re = re.search(r"(\d+)\.html", metadata.url)
            if id_re:
                metadata._id = id_re.group(1)

title = soup[0].select_one("h1.pr_header__heading")
if title:
metadata.title = title.getText()

author = soup[0].select("div.pr_producers__manufacturer > div.pr_producers__item")
if author:
metadata.author = [a.getText().strip() for a in author]

publisher = soup[0].select_one("div.pr_producers__publisher")
if publisher:
# all_caps = publisher.getText().strip()
# all_caps = all_caps.replace("I","ı").replace("İ", "i")
metadata.publisher = title_case(publisher.getText().strip())

rating_ul = soup[0].select_one("ul.pr_rating-stars")
if rating_ul:
rating = len(rating_ul.select(".icon__star-big--selected"))
metadata.rating = rating

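            # Image links carry a size suffix (starting at "wh"); strip it to get the full-size cover URL.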
cover_url_with_res = soup[0].select_one("div.pr_images")
if cover_url_with_res:
multi = cover_url_with_res.select_one("ul.pr_images__thumb-list")
if multi:
metadata.cover_url = [(x := a["href"])[:x.index("wh") - 1] for a in multi.find_all("a")]
else:
jbox = cover_url_with_res.select_one("a.js-jbox-book-cover")["href"]
cover_url = jbox[:jbox.index("wh") - 1]
metadata.cover_url = [cover_url]

if metadata.cover_url:
metadata.cover_id = metadata.cover_url[0].split(":")[-1]

desc = soup[0].select_one("span.info__text")
if desc:
metadata.desc = str(desc)

            # The attributes table lists key/value pairs as alternating <td> cells; collect them
            # into a dict, joining values for repeated keys (e.g. multiple translators) with commas.
            attrs_table = {}
            attrs_div = soup[0].select_one("div.attributes")
            rows = attrs_div.find_all("tr") if attrs_div else []
            tds = [td for row in rows for td in row.find_all("td")]
            for key_td, val_td in zip(tds[::2], tds[1::2]):
                key = key_td.getText().strip()
                val = val_td.getText().strip()
                if key not in attrs_table:
                    attrs_table[key] = val
                else:
                    attrs_table[key] += f", {val}"

editors = attrs_table.get("Editor:")
if editors:
metadata.editor = editors.strip()

translators = attrs_table.get("Çevirmen:")
if translators:
metadata.translator = translators.strip()

date = attrs_table.get("Yayın Tarihi:")
if date:
metadata.date = datetime.strptime(date, "%d.%m.%Y")

original_name = attrs_table.get("Orijinal Adı:")
if original_name:
metadata.original_name = original_name

isbn = attrs_table.get("ISBN:")
if isbn:
metadata.isbn = isbn

lang = attrs_table.get("Dil:")
if lang:
# lang = lang.replace("I","ı").replace("İ", "i")
metadata.lang = title_case(lang)

pages_num = attrs_table.get("Sayfa Sayısı:")
if pages_num:
metadata.pages_num = int(pages_num)

tags_ul = soup[0].select_one("ul.rel-cats__list")
            tags = set()
if tags_ul:
tags = {t.getText() for t in tags_ul.find_all("span")}
if tags:
if "Kitap" in tags:
tags.remove("Kitap")
if "Diğer" in tags:
tags.remove("Diğer")
metadata.tags = tags
metadata_list.append(metadata)
return metadata_list

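# The calibre metadata source plugin itself; it wires the parser above into calibre's identify/cover-download workflow.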
class Kitapyurdu(Source):
name = "Kitapyurdu"
author = "Nezih <https://github.com/anezih>"
description = _("Downloads metadata and covers from kitapyurdu.com")
supported_platforms = ["windows", "osx", "linux"]
capabilities = frozenset(["identify", "cover"])
touched_fields = frozenset(
[
"title", "authors", "tags", "publisher", "comments", "pubdate",
"rating", "identifier:isbn", "language", "identifier:kitapyurdu"
]
)
supports_gzip_transfer_encoding = True
cached_cover_url_is_reliable = True
has_html_comments = True
prefer_results_with_isbn = False
can_get_multiple_covers = True

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.max_res = self.prefs.get("entries_per_search_result_page")

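    # Single plugin option: how many results to request per search page (larger values mean slower searches).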
options = (
        Option(
"entries_per_search_result_page",
"choices",
20,
_("Max. number of search results."),
_("Select max. number of search results. (50 may cause timeout errors.)"),
{20:"20", 25:"25", 50:"50"}
),
)

def get_book_url_name(self, idtype, idval, url):
return "Kitapyurdu"

def get_book_url(self, identifiers):
kitapyurdu_id = identifiers.get("kitapyurdu")
if kitapyurdu_id:
url = f"https://www.kitapyurdu.com/kitap/-/{kitapyurdu_id}.html"
return ("kitapyurdu", kitapyurdu_id, url)
else:
return None

def build_query(self, log, title=None, authors=None, only_title=False, rm_accents=False, ss=False, sj=True):
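        # Build a lowercase, normalized search string from the title tokens (plus first-author tokens unless
        # only_title); accents can optionally be stripped for a looser second-pass search.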
title_tokens = []
author_tokens = []
if title or authors:
title_tokens = list(self.get_title_tokens(title=title, strip_subtitle=ss, strip_joiners=sj))
if not only_title:
author_tokens = list(self.get_author_tokens(authors=authors, only_first_author=True))
        # Use "query" rather than "all" so the builtin all() is not shadowed.
        query = lower(" ".join(title_tokens + author_tokens))
        query = normalize(query)
        if rm_accents:
            query = remove_accents_icu(query)
            log.info("Removed accents from query.")
        if query:
            log.info(f"Constructed query: {query}")
            return query
        else:
            return None

def create_metadata_list(self, log, title=None, authors=None):
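        # First pass searches with title + author; if that yields nothing, retry with a title-only,
        # subtitle-stripped, accent-free query.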
title_authors = self.build_query(log=log, title=title, authors=authors)
ky_metadata_obj = KitapyurduMetadataParser(query=title_authors, limit=self.max_res, logger=log)
metadata_list: list[KitapyurduMetadata] = ky_metadata_obj.parse_pages()
if metadata_list:
return metadata_list
else:
            log.info("Second pass at building the query: only_title, strip_subtitle, rm_accents")
title_authors = self.build_query(log=log, title=title, authors=authors, only_title=True, rm_accents=True, ss=True)
ky_metadata_obj = KitapyurduMetadataParser(query=title_authors, limit=self.max_res, logger=log)
metadata_list: list[KitapyurduMetadata] = ky_metadata_obj.parse_pages()
if metadata_list:
return metadata_list
else:
return None

def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
metadata_list = self.create_metadata_list(log=log, title=title, authors=authors)
        if not metadata_list:
            return
        for relevance, mi in enumerate(metadata_list, start=1):
            mi.source_relevance = relevance
            result_queue.put(mi.to_calibre_metadata())

def get_cached_cover_url(self, identifiers):
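        # kitapyurdu_kapak stores the fn: image id, from which the cover URL can be rebuilt directly.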
_id = identifiers.get('kitapyurdu_kapak')
if _id:
return f"https://img.kitapyurdu.com/v1/getImage/fn:{_id}"
else:
return None

    def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
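        # Prefer the cached cover URL; if it is missing, run identify() to discover one before downloading.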
if abort.is_set():
return
cached = self.get_cached_cover_url(identifiers=identifiers)
if cached:
try:
cover_data = self.browser.open_novisit(cached, timeout=timeout).read()
if cover_data:
result_queue.put((self, cover_data))
            except Exception:
                log.exception(f"Failed to get covers from: {cached}")
else:
log.info("Could not find cached URL for covers, running identify...")
cached_from_ident = ""
queue = Queue()
            self.identify(log, result_queue=queue, abort=abort, title=title, authors=authors, identifiers=identifiers, timeout=timeout)
if abort.is_set():
return
res = []
while True:
try:
res.append(queue.get_nowait())
except Empty:
break
res.sort(
key=self.identify_results_keygen(
title=title,
authors=authors,
identifiers=identifiers
)
)
for mi in res:
cached = self.get_cached_cover_url(mi.identifiers)
if cached:
cached_from_ident = cached
break
            if not cached_from_ident:
                log.info("Identify results did not contain a cover URL.")
                return
            try:
                cover_data = self.browser.open_novisit(cached_from_ident, timeout=timeout).read()
                if cover_data:
                    result_queue.put((self, cover_data))
            except Exception:
                log.exception(f"Failed to get covers from: {cached_from_ident}")
