Source code for richy.news.downloader

import logging
import re
from datetime import datetime
from urllib.parse import urlparse

import feedparser
import requests
from django.conf import settings
from django.utils.timezone import make_aware
from karpet import Karpet
from metadata_parser import MetadataParser

from ..core.scraper import Downloader
from ..news.models import News

logger = logging.getLogger(__name__)


class BaseManager:
    """
    Base class with URL checks shared by the news managers.
    """

    def is_on_blacklist(self, url):
        """
        Checks if URL is on blacklist (NEWS_BLACKLIST).
        Check is based on server name ("www." is stripped off if found).

        :param url: URL to be checked.
        :return: Boolean - True if on the list, False otherwise.
        """

        netloc = urlparse(url).netloc

        # Normalize server URL.
        server = netloc[4:] if netloc.startswith("www.") else netloc

        # Return the membership test itself so the "not found" case yields
        # an explicit False instead of an implicit None.
        return server in settings.NEWS_BLACKLIST

    def is_video(self, url):
        """
        Checks if URL points to YAHOO finance video section.

        :param url: URL to be checked.
        :return: Boolean - True if the URL starts with
            http(s)://finance.yahoo.com/video/, False otherwise.
        """

        # Dots are escaped so only the literal host name matches.
        return re.match(r"^https?://finance\.yahoo\.com/video/", url) is not None
class ShareIndexEtfManager(BaseManager):
    """
    Downloads news for a share, index or ETF item from the RSS feed
    configured in settings.NEWS_FEED.
    """

    def __init__(self, item):
        """
        :param item: Item the news are fetched for. Its symbol is used
            to build the feed URL.
        """

        self.item = item

    def fetch(self):
        """
        Fetches news for self.item.
        Uses YAHOO's RSS feed (NEWS_FEED) and for each item in the feed
        tries to download metadata inside self.create_news().
        Also uses blacklist from settings - NEWS_BLACKLIST.

        :return: Number of downloaded news.
        :raises: HTTP error raised by resp.raise_for_status() when
            the feed itself can't be downloaded.
        """

        # Fetches the feed.
        # Items having "index" attribute get "^" in front of the symbol.
        session = Downloader.get_client()
        resp = session.get(
            settings.NEWS_FEED.format(
                f"^{self.item.symbol}"
                if hasattr(self.item, "index")
                else self.item.symbol
            ),
            timeout=(5, 15),
        )
        resp.raise_for_status()

        feed = feedparser.parse(resp.content)
        count = 0

        for e in feed["entries"]:
            # Parses the target news URL out of YAHOO redirect script.
            try:
                url = self.parse_target(e["link"])
            except Exception:
                logger.error(
                    "Skipping {} because of error.".format(e["link"]), exc_info=True
                )
                continue

            # Try to find already downloaded news (based on the URL).
            # If found -> skip.
            if News.objects.filter(url=url).exists() or self.is_on_blacklist(url):
                logger.debug(
                    "News from URL {} already downloaded or on blacklist. Skipping ...".format(
                        url
                    )
                )
                continue

            logger.debug("Downloading {}.".format(url))

            # Perform metadata fetch.
            # One broken entry must not stop the rest of the feed.
            try:
                news = self.create_news(url, e)
            except Exception:
                logger.exception("Couldn't create share/index/etf news.")
                continue

            logger.debug(
                "News from URL {} for {} has been downloaded and saved as news {}.".format(
                    e["link"], self.item.symbol, news.pk
                )
            )
            count += 1

        return count

    def parse_target(self, url):
        """
        If given URL is YAHOO's redirect URL tries to fetch the target URL
        from the script that is returned by YAHOO on origin URL.
        If not return origin URL.

        :param url: RSS feed URL.
        :return: Target URL.
        :raises ValueError: If the redirect page contains no target URL.
        """

        # If URL is not YAHOO's redirect url, return it.
        if not re.match(r"^https?://finance\.yahoo\.com/r/", url):
            return url

        # Fetch Javascript where is the hidden URL.
        response = requests.get(url, timeout=Downloader.DEFAULT_TIMEOUT)
        body = response.content.decode("utf-8")

        # Parse out URL.
        # Stop at the first closing quote - greedy match would run up to
        # the last quote found anywhere in the page.
        match = re.search(r"URL='([^']+)'", body)

        if match is None:
            raise ValueError(
                "No target URL found in redirect page {}.".format(url)
            )

        return match.group(1)

    def create_news(self, url, e):
        """
        Tries to parse out metadata and save them to the news.
        If no metadata or an exception occurred, RSS data are used.

        :param url: News URL.
        :param e: RSS feed item data.
        :return: News model instance - the newly created news.
        """

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
        }

        # Date always comes from the feed so it is computed just once,
        # before any network request is made.
        published = make_aware(datetime(*(e["published_parsed"])[:6]))

        try:
            page = MetadataParser(
                url,
                search_head_only=False,
                url_headers=headers,
                requests_timeout=Downloader.DEFAULT_TIMEOUT,
            )

            titles = page.get_metadatas("title")
            descriptions = page.get_metadatas("description")
            images = page.get_metadatas("image")

            news = News(
                title=titles[0] if titles else e["title"],
                description=descriptions[0] if descriptions else e["description"],
                image=images[0] if images else None,
                date=published,
            )
        except Exception:
            # Best effort - metadata are optional, RSS data are good enough.
            # Leave a trace so the failure is not swallowed silently.
            logger.debug(
                "Metadata fetch for {} failed. Using RSS data.".format(url),
                exc_info=True,
            )
            news = News(
                title=e["title"],
                description=e["description"],
                date=published,
            )

        news.url = url
        news.item = self.item
        news.is_video = self.is_video(url)
        news.save()

        return news
class CoinManager(BaseManager):
    """
    Downloads news for a coin item using the karpet library.
    """

    def __init__(self, item):
        self.item = item

    def fetch(self):
        """
        Fetches news with karpet library (up to 30 entries per call).
        URLs that are already stored or listed in settings.NEWS_BLACKLIST
        are skipped.

        :return: Number of downloaded news.
        """

        saved = 0
        client = Karpet()

        for entry in client.fetch_news(self.item.symbol, limit=30):
            link = entry["url"]

            # Database lookup goes first, blacklist only if it's a new URL.
            already_stored = News.objects.filter(url=link).exists()

            if already_stored or self.is_on_blacklist(link):
                logger.debug(
                    f"News from URL {link} already downloaded or on blacklist. Skipping ..."
                )
                continue

            try:
                record = News(
                    title=entry["title"],
                    description=entry["description"],
                    image=entry["image"],
                    date=entry["date"],
                    item=self.item,
                    url=link,
                )
                record.save()
            except Exception:
                # A single broken entry is logged and skipped.
                logger.exception("Couldn't create coin news.")
            else:
                saved += 1

        return saved