Source code for richy.news.downloader
import logging
import re
from datetime import datetime
from urllib.parse import urlparse
import feedparser
import requests
from django.conf import settings
from django.utils.timezone import make_aware
from karpet import Karpet
from metadata_parser import MetadataParser
from ..core.scraper import Downloader
from ..news.models import News
logger = logging.getLogger(__name__)
[docs]
class BaseManager:
    """
    Base class for news managers.

    Provides URL checks shared by the concrete managers: blacklist
    lookup and YAHOO video detection.
    """

    def is_on_blacklist(self, url):
        """
        Checks if URL is on blacklist (NEWS_BLACKLIST).

        Check is based on server name ("www." is stripped off if found).

        :param url: URL to be checked.
        :return: Boolean - True if on the list False otherwise.
        """
        server = urlparse(url).netloc

        # Normalize server URL.
        if server.startswith("www."):
            server = server[4:]

        # Membership test always yields a real boolean, so callers get
        # False (not an implicit None) for URLs that are not blacklisted.
        return server in settings.NEWS_BLACKLIST

    def is_video(self, url):
        """
        Checks if URL points to a YAHOO finance video page.

        :param url: URL to be checked.
        :return: Boolean - True if the URL starts with
            http(s)://finance.yahoo.com/video/, False otherwise.
        """
        # Dots are escaped so only the literal host name matches.
        return re.match(r"^https?://finance\.yahoo\.com/video/", url) is not None
[docs]
class ShareIndexEtfManager(BaseManager):
    """
    News manager for share, index and ETF items.

    News are read from YAHOO's RSS feed (NEWS_FEED) and enriched with
    metadata downloaded from the target article page.
    """

    def __init__(self, item):
        """
        :param item: Item the downloaded news are attached to. Its
            ``symbol`` attribute is used to build the feed URL.
        """
        self.item = item

    def fetch(self):
        """
        Fetches news for self.item.

        Uses YAHOO's RSS feed (NEWS_FEED) and for each
        item in the feed tries to download metadata inside
        self.create_news().
        Also uses blacklist from settings - NEWS_BLACKLIST.

        :return: Number of downloaded news.
        :raises: Whatever the HTTP client raises when the feed itself
            cannot be downloaded (including raise_for_status()).
        """
        # Items that carry an "index" attribute get a "^" prefix in
        # front of the symbol used in the feed URL.
        symbol = self.item.symbol
        if hasattr(self.item, "index"):
            symbol = f"^{symbol}"

        # Fetches the feed.
        session = Downloader.get_client()
        resp = session.get(settings.NEWS_FEED.format(symbol), timeout=(5, 15))
        resp.raise_for_status()
        feed = feedparser.parse(resp.content)

        count = 0

        for e in feed["entries"]:
            # Read the link once and tolerate entries without one, so a
            # single malformed entry cannot abort the whole run.
            link = e.get("link")

            if not link:
                logger.warning("Skipping feed entry without a link.")
                continue

            # Parses the target news URL out of YAHOO redirect script.
            try:
                url = self.parse_target(link)
            except Exception:
                logger.error(
                    "Skipping {} because of error.".format(link), exc_info=True
                )
                continue

            # Try to find already downloaded news (based on the URL).
            # If found -> skip.
            if News.objects.filter(url=url).exists() or self.is_on_blacklist(url):
                logger.debug(
                    "News from URL {} already downloaded or on blacklist. Skipping ...".format(
                        url
                    )
                )
                continue

            logger.debug("Downloading {}.".format(url))

            # Perform metadata fetch.
            try:
                news = self.create_news(url, e)
            except Exception:
                logger.exception("Couldn't create share/index/etf news.")
                continue

            logger.debug(
                "News from URL {} for {} has been downloaded and saved as news {}.".format(
                    link, self.item.symbol, news.pk
                )
            )
            count += 1

        return count

    def parse_target(self, url):
        """
        If given URL is YAHOO's redirect URL
        tries to fetch the target URL from the script that
        is returned by YAHOO on origin URL.
        If not return origin URL.

        :param url: RSS feed URL.
        :return: Target URL.
        :raises ValueError: If the redirect page contains no target URL.
        """
        # If URL is not YAHOO's redirect url, return it.
        if not re.match(r"^https?://finance\.yahoo\.com/r/", url):
            return url

        # Fetch Javascript where is the hidden URL.
        response = requests.get(url, timeout=Downloader.DEFAULT_TIMEOUT)
        body = response.content.decode("utf-8")

        # Parse out URL. The capture stops at the first closing quote so
        # later quotes in the page cannot leak into the result.
        match = re.search(r"URL='([^']+)'", body)

        if match is None:
            raise ValueError(
                "No target URL found in redirect page {}.".format(url)
            )

        return match.group(1)

    @staticmethod
    def _published_at(e):
        """
        Builds an aware datetime from the feed item's publication date.

        :param e: RSS feed item data.
        :return: Aware datetime in UTC.
        """
        # Local import: the module imports only the datetime class.
        from datetime import timezone

        # feedparser normalizes *_parsed dates to UTC, so the naive value
        # has to be interpreted as UTC rather than the current time zone.
        return make_aware(
            datetime(*e["published_parsed"][:6]), timezone=timezone.utc
        )

    def create_news(self, url, e):
        """
        Tries to parse our metadata and save them to the news.
        If no metadata or an exception occurred, RSS data are used.

        :param url: News URL.
        :param e: RSS feed item data.
        :return: News model instance - the newly created news.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
        }
        published = self._published_at(e)

        try:
            page = MetadataParser(
                url,
                search_head_only=False,
                url_headers=headers,
                requests_timeout=Downloader.DEFAULT_TIMEOUT,
            )
            titles = page.get_metadatas("title")
            descriptions = page.get_metadatas("description")
            images = page.get_metadatas("image")
        except Exception:
            # Metadata are best-effort only; keep a trace of the failure
            # and fall back to what the RSS item provides.
            logger.debug(
                "Couldn't fetch metadata from {}, using RSS data.".format(url),
                exc_info=True,
            )
            news = News(
                title=e["title"],
                description=e["description"],
                date=published,
            )
        else:
            news = News(
                title=titles[0] if titles else e["title"],
                description=descriptions[0] if descriptions else e["description"],
                image=images[0] if images else None,
                date=published,
            )

        news.url = url
        news.item = self.item
        news.is_video = self.is_video(url)
        news.save()

        return news
[docs]
class CoinManager(BaseManager):
    """
    News manager for coin items; news are obtained via the karpet library.
    """

    def __init__(self, item):
        """
        :param item: Item the downloaded news are attached to. Its
            ``symbol`` attribute is passed to karpet.
        """
        self.item = item

    def fetch(self):
        """
        Downloads news for self.item using karpet (at most 30 items requested).

        News whose URL is already stored or whose server is on
        NEWS_BLACKLIST are skipped. News that fail to be created are
        logged and skipped as well.

        :return: Number of downloaded news.
        """
        client = Karpet()
        saved = 0

        for article in client.fetch_news(self.item.symbol, limit=30):
            already_stored = News.objects.filter(url=article["url"]).exists()

            if already_stored or self.is_on_blacklist(article["url"]):
                logger.debug(
                    f"News from URL {article['url']} already downloaded or on blacklist. Skipping ..."
                )
                continue

            try:
                record = News(
                    title=article["title"],
                    description=article["description"],
                    image=article["image"],
                    date=article["date"],
                    item=self.item,
                    url=article["url"],
                )
                record.save()
            except Exception:
                logger.exception("Couldn't create coin news.")
            else:
                # Only successfully saved news are counted.
                saved += 1

        return saved