Source code for richy.news.downloader

import logging
import re
from datetime import datetime
from urllib.parse import urlparse

import feedparser
import requests
from django.conf import settings
from django.utils.timezone import make_aware
from karpet import Karpet
from metadata_parser import MetadataParser

from ..core.scraper import Downloader
from ..news.models import News

logger = logging.getLogger(__name__)



[docs]
class BaseManager:
    """
    Base class with URL helpers shared by the news managers.
    """

    # Matches YAHOO Finance video pages. The dots are escaped so that they
    # only match a literal "." in the host name.
    _VIDEO_URL_RE = re.compile(r"^https?://finance\.yahoo\.com/video/")

    def is_on_blacklist(self, url):
        """
        Checks if URL is on blacklist (NEWS_BLACKLIST).
        Check is based on server name ("www." is stripped off if found).

        :param url: URL to be checked.
        :return: Boolean - True if on the list False otherwise.
        """

        netloc = urlparse(url).netloc

        # Normalize server URL.
        server = netloc[4:] if netloc.startswith("www.") else netloc

        # Return the membership test itself so the result is always a real
        # boolean (never an implicit None when the server is not listed).
        return server in settings.NEWS_BLACKLIST

    def is_video(self, url):
        """
        Checks if URL points to a YAHOO Finance video page.

        :param url: URL to be checked.
        :return: Boolean - True if URL starts with
            http(s)://finance.yahoo.com/video/ False otherwise.
        """

        return self._VIDEO_URL_RE.match(url) is not None





[docs]
class ShareIndexEtfManager(BaseManager):
    """
    News downloader for a single share/index/ETF item.
    News are read from YAHOO's RSS feed (NEWS_FEED).
    """

    # YAHOO's redirect links. Dots are escaped so that only the literal
    # host name matches.
    _REDIRECT_URL_RE = re.compile(r"^https?://finance\.yahoo\.com/r/")

    # Target URL embedded in the redirect page as URL='...'.
    # [^']+ stops at the first closing quote so a later quote on the page
    # can't be swallowed into the captured URL.
    _TARGET_URL_RE = re.compile(r"URL='([^']+)'")

    def __init__(self, item):
        """
        :param item: Item the news are downloaded for. Its ``symbol``
            attribute is used to build the feed URL.
        """

        self.item = item

    def fetch(self):
        """
        Fetches news for self.item.
        Uses YAHOO's RSS feed (NEWS_FEED) and for each
        item in the feed tries to download metadata inside
        self.create_news().

        Also uses blacklist from settings - NEWS_BLACKLIST.

        Feed entries that fail to be processed are logged and skipped.

        :return: Number of downloaded news.
        :raises Exception: When the feed itself can't be downloaded
            (the error from the HTTP client is propagated).
        """

        # Items exposing an "index" attribute get their symbol prefixed
        # with "^" in the feed URL.
        if hasattr(self.item, "index"):
            symbol = f"^{self.item.symbol}"
        else:
            symbol = self.item.symbol

        # Fetches the feed.
        session = Downloader.get_client()
        resp = session.get(settings.NEWS_FEED.format(symbol), timeout=(5, 15))
        resp.raise_for_status()
        feed = feedparser.parse(resp.content)
        count = 0

        for e in feed["entries"]:
            # Read the link once and tolerantly. Indexing e["link"] again
            # inside the error handler would raise a second, uncaught
            # KeyError for entries without a link and abort the whole run.
            link = e.get("link")

            # Parses the target news URL out of YAHOO redirect script.
            try:
                if not link:
                    raise ValueError("Feed entry has no link.")

                url = self.parse_target(link)
            except Exception:
                logger.error(
                    "Skipping {} because of error.".format(link), exc_info=True
                )
                continue

            # Try to find already downloaded news (based on the URL).
            # If found -> skip.
            # The blacklist check goes first - it needs no database query.
            if self.is_on_blacklist(url) or News.objects.filter(url=url).exists():
                logger.debug(
                    "News from URL {} already downloaded or on blacklist. Skipping ...".format(
                        url
                    )
                )
                continue

            logger.debug("Downloading {}.".format(url))

            # Perform metadata fetch.
            try:
                news = self.create_news(url, e)
            except Exception:
                logger.exception("Couldn't create share/index/etf news.")

                continue

            logger.debug(
                "News from URL {} for {} has been downloaded and saved as news {}.".format(
                    link, self.item.symbol, news.pk
                )
            )
            count += 1

        return count

    def parse_target(self, url):
        """
        If given URL is YAHOO's redirect URL
        tries to fetch the target URL from the script that
        is returned by YAHOO on origin URL.

        If not return origin URL.

        :param url: RSS feed URL.
        :return: Target URL.
        :raises ValueError: When the redirect page contains no target URL.
        :raises requests.RequestException: When the redirect page
            can't be downloaded.
        """

        # If URL is not YAHOO's redirect url, return it.
        if not self._REDIRECT_URL_RE.match(url):
            return url

        # Fetch Javascript where is the hidden URL.
        response = requests.get(url, timeout=Downloader.DEFAULT_TIMEOUT)
        # An error page holds no target URL - fail here with a clear error.
        response.raise_for_status()
        body = response.content.decode("utf-8", errors="replace")

        # Parse out URL.
        match = self._TARGET_URL_RE.search(body)

        if match is None:
            raise ValueError("No target URL found in redirect page {}.".format(url))

        return match.group(1)

    @staticmethod
    def _first_metadata(page, field):
        """
        Returns the first metadata value of the field or None
        when the page has no (or empty) metadata for it.
        """

        values = page.get_metadatas(field)

        return values[0] if values else None

    def create_news(self, url, e):
        """
        Tries to parse our metadata and save them to the news.
        If no metadata or an exception occurred, RSS data are used.

        :param url: News URL.
        :param e: RSS feed item data.
        :return: News model instance - the newly created news.
        :raises Exception: When the RSS data are incomplete (missing title,
            description or publish date) or the news can't be saved.
        """

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
        }

        # The publish date always comes from the RSS data so it is computed
        # once, before any network request is made.
        published = make_aware(datetime(*(e["published_parsed"])[:6]))

        try:
            page = MetadataParser(
                url,
                search_head_only=False,
                url_headers=headers,
                requests_timeout=Downloader.DEFAULT_TIMEOUT,
            )
            title = self._first_metadata(page, "title")
            description = self._first_metadata(page, "description")

            news = News(
                title=title if title is not None else e["title"],
                description=(
                    description if description is not None else e["description"]
                ),
                image=self._first_metadata(page, "image"),
                date=published,
            )
        except Exception:
            # Best effort only - fall back to RSS data, but leave a trace
            # of why the metadata were not used.
            logger.debug(
                "Metadata for {} not available, using RSS data.".format(url),
                exc_info=True,
            )
            news = News(
                title=e["title"],
                description=e["description"],
                date=published,
            )

        news.url = url
        news.item = self.item
        news.is_video = self.is_video(url)

        news.save()

        return news





[docs]
class CoinManager(BaseManager):
    """
    News downloader for a single coin item.
    News are obtained through the karpet library.
    """

    def __init__(self, item):
        self.item = item

    def fetch(self):
        """
        Downloads coin news via karpet (at most 30 per call) and stores
        every one that is neither saved yet nor on the blacklist
        (NEWS_BLACKLIST from settings).

        News that fail to be saved are logged and skipped.

        :return: Number of downloaded news.
        """

        client = Karpet()
        saved = 0

        for entry in client.fetch_news(self.item.symbol, limit=30):
            link = entry["url"]

            # Skip what is already stored or comes from a blacklisted server.
            already_stored = News.objects.filter(url=link).exists()

            if already_stored or self.is_on_blacklist(link):
                logger.debug(
                    f"News from URL {link} already downloaded or on blacklist. Skipping ..."
                )
                continue

            try:
                News(
                    title=entry["title"],
                    description=entry["description"],
                    image=entry["image"],
                    date=entry["date"],
                    item=self.item,
                    url=link,
                ).save()
            except Exception:
                logger.exception("Couldn't create coin news.")
            else:
                # Only successfully saved news are counted.
                saved += 1

        return saved