"""
Tools for microformats production, consumption and analysis.
Microformats are a general way to mark up any HTML document with
classes and properties. This module uses domain-specific assumptions
about the classes (specifically h-card, h-entry and h-event) to extract
certain interesting properties.
"""
import collections
import datetime
import re
import string
import unicodedata
from urllib.parse import urljoin
import bs4
import easyuri
from mf2py import parse
__all__ = ["parse", "representative_card"]
URL_ATTRIBUTES = {
"a": ["href"],
"link": ["href"],
"img": ["src"],
"audio": ["src"],
"video": ["src", "poster"],
"source": ["src"],
}
# From https://indieweb.org/location#How_to_determine_the_location_of_a_microformat
LOCATION_PROPERTIES = frozenset(
(
"street-address",
"extended-address",
"post-office-box",
"locality",
"region",
"postal-code",
"country-name",
"label",
"latitude",
"longitude",
"altitude",
"name",
)
)
def get_url(parsed):
"""Given a property value that may be a list of simple URLs or complex
h-* dicts (with a url property), extract a list of URLs. This is useful
when parsing e.g., in-reply-to.
Args:
mf (string or dict): URL or h-cite-style dict
Returns:
list: a list of URLs
"""
urls = []
for item in parsed["items"]:
if isinstance(item, str):
urls.append(item)
elif isinstance(item, dict) and any(
x.startswith("h-") for x in item.get("type", [])
):
urls.extend(item.get("properties", {}).get("url", []))
return urls
def find_first_entry(parsed, types):
"""Find the first interesting h-* object in BFS-order
:param dict parsed: a mf2py parsed dict
:param list types: target types, e.g. ['h-entry', 'h-event']
:return: an mf2py item that is one of `types`, or None
"""
return next(_find_all_entries(parsed, types, False), None)
def find_all_entries(parsed, types, include_properties=False):
"""Find all h-* objects of a given type in BFS-order. Traverses the
top-level items and their children and descendents. Includes property
values (e.g. finding all h-cards would not find values of
"p-author h-card") only if `include_properties` is True.
:param dict parsed: a mf2py parsed dict
:param list types: target types, e.g. ['h-entry', 'h-event']
:param boolean include_properties: include properties in search of entries
    :return: all entries with any of the target types
"""
return list(_find_all_entries(parsed, types, include_properties))
def _find_all_entries(parsed, types, include_properties):
queue = collections.deque(item for item in parsed["items"])
while queue:
item = queue.popleft()
if any(h_class in item.get("type", []) for h_class in types):
yield item
queue.extend(item.get("children", []))
if include_properties:
queue.extend(
prop
for props in item.get("properties", {}).values()
for prop in props
if isinstance(prop, dict)
)
def find_datetimes(parsed):
"""Find published, updated, start, and end dates.
:param dict parsed: a mf2py parsed dict
:return: a dictionary from property type to datetime or date
"""
    hentry = find_first_entry(parsed, ["h-entry", "h-event"])
    result = {}
    if hentry:
        for prop in ("published", "updated", "start", "end"):
            date_strs = hentry["properties"].get(prop, [])
            result[prop] = parse_dt(" ".join(date_strs))
    return result
def parse_dt(s):
"""The definition for microformats2 dt-* properties are fairly
lenient. This method converts an mf2 date string into either a
datetime.date or datetime.datetime object. Datetimes will be naive
unless a timezone is specified.
:param str s: a mf2 string representation of a date or datetime
:return: datetime.date or datetime.datetime
:raises ValueError: if the string is not recognizable
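
    Illustrative examples (values chosen for this sketch, not from the spec)::

        parse_dt("2024-05-01")                 # -> datetime.date(2024, 5, 1)
        parse_dt("2024-05-01 13:30")           # -> naive datetime(2024, 5, 1, 13, 30)
        parse_dt("2024-05-01T13:30:00-07:00")  # -> timezone-aware datetime at UTC-07:00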
"""
if not s:
return None
s = re.sub(r"\s+", " ", s)
date_re = r"(?P<year>\d{4,})-(?P<month>\d{1,2})-(?P<day>\d{1,2})"
time_re = r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(:(?P<second>\d{2})(\.(?P<microsecond>\d+))?)?"
tz_re = r"(?P<tzz>Z)|(?P<tzsign>[+-])(?P<tzhour>\d{1,2}):?(?P<tzminute>\d{2})"
dt_re = f"{date_re}((T| ){time_re} ?({tz_re})?)?$"
m = re.match(dt_re, s)
if not m:
raise ValueError(f"unrecognized datetime {s}")
year = m.group("year")
month = m.group("month")
day = m.group("day")
hour = m.group("hour")
if not hour:
return datetime.date(int(year), int(month), int(day))
minute = m.group("minute") or "00"
second = m.group("second") or "00"
    # an hour is guaranteed here; the date-only case returned above
    dt = datetime.datetime(
        int(year), int(month), int(day), int(hour), int(minute), int(second)
    )
    if m.group("tzz"):
        dt = dt.replace(tzinfo=datetime.timezone.utc)
    else:
        tzsign = m.group("tzsign")
        tzhour = m.group("tzhour")
        tzminute = m.group("tzminute") or "00"
        if tzsign and tzhour:
            offset = datetime.timedelta(hours=int(tzhour), minutes=int(tzminute))
            if tzsign == "-":
                offset = -offset
            dt = dt.replace(
                tzinfo=datetime.timezone(offset, f"{tzsign}{tzhour}:{tzminute}")
            )
    return dt
def get_plain_text(values, strip=True):
"""Get the first value in a list of values that we expect to be plain-text.
If it is a dict, then return the value of "value".
:param list values: a list of values
:param boolean strip: true if we should strip the plaintext value
:return: a string or None
"""
if values:
v = values[0]
if isinstance(v, dict):
v = v.get("value", "")
if strip:
v = v.strip()
return v
def classify_comment(parsed, target_urls):
"""Find and categorize comments that reference any of a collection of
target URLs. Looks for references of type reply, like, and repost.
:param dict parsed: a mf2py parsed dict
    :param list target_urls: a collection of URLs that represent the
      target post; this can include alternate or shortened URLs.
:return: a list of applicable comment types ['like', 'reply', 'repost']
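
    Illustrative sketch (hypothetical URLs; parse() fetches the source page)::

        parsed = parse(url="https://commenter.example/reply-1")
        classify_comment(parsed, ["https://example.com/original-post"])
        # -> e.g. ["reply"] if the reply marks the original as in-reply-to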
"""
def process_references(objs, reftypes, result):
for obj in objs:
if isinstance(obj, dict):
if any(
url in target_urls
for url in obj.get("properties", {}).get("url", [])
):
result += (r for r in reftypes if r not in result)
elif obj in target_urls:
result += (r for r in reftypes if r not in result)
result = []
hentry = find_first_entry(parsed, ["h-entry"])
if hentry:
reply_type = []
if "rsvp" in hentry["properties"]:
reply_type.append("rsvp")
if "invitee" in hentry["properties"]:
reply_type.append("invite")
reply_type.append("reply")
# TODO handle rel=in-reply-to
for prop in ("in-reply-to", "reply-to", "reply"):
process_references(hentry["properties"].get(prop, []), reply_type, result)
for prop in ("like-of", "like"):
process_references(hentry["properties"].get(prop, []), ("like",), result)
for prop in ("repost-of", "repost"):
process_references(hentry["properties"].get(prop, []), ("repost",), result)
return result
def parse_author(obj):
"""Parse the value of a u-author property, can either be a compound
h-card or a single name or url.
:param object obj: the mf2 property value, either a dict or a string
:result: a dict containing the author's name, photo, and url
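
    Example::

        parse_author("https://example.com/")  # -> {"url": "https://example.com/"}
        parse_author("Jane Doe")              # -> {"name": "Jane Doe"}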
"""
result = {}
if isinstance(obj, dict):
names = obj["properties"].get("name")
photos = obj["properties"].get("photo")
urls = obj["properties"].get("url")
if names:
result["name"] = names[0]
if photos:
result["photo"] = photos[0]
if urls:
result["url"] = urls[0]
elif obj:
if obj.startswith("http://") or obj.startswith("https://"):
result["url"] = obj
else:
result["name"] = obj
return result
def find_author(parsed, source_url=None, hentry=None, fetch_mf2_func=None):
"""Use the authorship discovery algorithm
https://indiewebcamp.com/authorship to determine an h-entry's
author.
:param dict parsed: an mf2py parsed dict.
:param str source_url: the source of the parsed document.
    :param dict hentry: (optional) the h-entry we're examining; if omitted,
      the first one in the document is used
    :param callable fetch_mf2_func: (optional) function that takes a URL
      and returns parsed mf2
:return: a dict containing the author's name, photo, and url
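
    Illustrative sketch (hypothetical URL; other pages are only fetched when
    fetch_mf2_func is given)::

        parsed = parse(url="https://example.com/post/1")
        find_author(
            parsed,
            "https://example.com/post/1",
            fetch_mf2_func=lambda url: parse(url=url),
        )
        # -> e.g. {"name": "...", "url": "...", "photo": "..."} or None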
"""
def find_hentry_author(hentry):
for obj in hentry["properties"].get("author", []):
return parse_author(obj)
def find_parent_hfeed_author(hentry):
for hfeed in _find_all_entries(parsed, ["h-feed"], False):
# find the h-entry's parent h-feed
if hentry in hfeed.get("children", []):
for obj in hfeed["properties"].get("author", []):
return parse_author(obj)
if not hentry:
hentry = find_first_entry(parsed, ["h-entry"])
if not hentry:
return None
author_page = None
# 3. if the h-entry has an author property, use that
author = find_hentry_author(hentry)
# 4. otherwise if the h-entry has a parent h-feed with author property,
# use that
if not author:
author = find_parent_hfeed_author(hentry)
# 5. if an author property was found
if author:
# 5.2 otherwise if author property is an http(s) URL, let the
# author-page have that URL
if list(author.keys()) == ["url"]:
author_page = author["url"]
# 5.1 if it has an h-card, use it, exit.
# 5.3 otherwise use the author property as the author name,
# exit.
else:
return author
# 6. if there is no author-page and the h-entry's page is a permalink page
if not author_page:
# 6.1 if the page has a rel-author link, let the author-page's
# URL be the href of the rel-author link
rel_authors = parsed.get("rels", {}).get("author", [])
if rel_authors:
author_page = rel_authors[0]
# 7. if there is an author-page URL
if author_page:
if not fetch_mf2_func:
return {"url": author_page}
# 7.1 get the author-page from that URL and parse it for microformats2
parsed = fetch_mf2_func(author_page)
hcards = find_all_entries(parsed, ["h-card"])
# 7.2 if author-page has 1+ h-card with url == uid ==
# author-page's URL, then use first such h-card, exit.
for hcard in hcards:
hcard_url = get_plain_text(hcard["properties"].get("url"))
hcard_uid = get_plain_text(hcard["properties"].get("uid"))
if (
hcard_url
and hcard_uid
and hcard_url == hcard_uid
and hcard_url == author_page
):
return parse_author(hcard)
# 7.3 else if author-page has 1+ h-card with url property
# which matches the href of a rel-me link on the author-page
# (perhaps the same hyperlink element as the u-url, though not
# required to be), use first such h-card, exit.
rel_mes = parsed.get("rels", {}).get("me", [])
for hcard in hcards:
hcard_url = get_plain_text(hcard["properties"].get("url"))
if hcard_url and hcard_url in rel_mes:
return parse_author(hcard)
# 7.4 if the h-entry's page has 1+ h-card with url ==
# author-page URL, use first such h-card, exit.
for hcard in hcards:
hcard_url = get_plain_text(hcard["properties"].get("url"))
if hcard_url and hcard_url == author_page:
return parse_author(hcard)
# 8. otherwise no deterministic author can be found.
return None
def representative_hcard(parsed, source_url):
"""Find the representative h-card for a URL
http://microformats.org/wiki/representative-h-card-parsing
:param dict parsed: an mf2 parsed dict
:param str source_url: the source of the parsed document.
:return: the representative h-card if one is found
"""
hcards = find_all_entries(parsed, ["h-card"], include_properties=True)
# uid and url both match source_url
for hcard in hcards:
if source_url in hcard["properties"].get("uid", []) and source_url in hcard[
"properties"
].get("url", []):
return hcard
# url that is also a rel=me
for hcard in hcards:
if any(
url in parsed.get("rels", {}).get("me", [])
for url in hcard["properties"].get("url", [])
):
return hcard
# single hcard with matching url
found = None
count = 0
for hcard in hcards:
if source_url in hcard["properties"].get("url", []):
found = hcard
count += 1
if count == 1:
return found
def convert_relative_paths_to_absolute(source_url, base_href, html):
"""Attempt to convert relative paths in foreign content
to absolute based on the source url of the document. Useful for
displaying images or links in reply contexts and comments.
Gets list of tags/attributes from `URL_ATTRIBUTES`. Note that this
function uses a regular expression to avoid adding a library
dependency on a proper parser.
    :param str source_url: the source of the parsed document.
    :param str base_href: (optional) the href value of the base tag
    :param str html: the text of the source document
    :return: the document with relative urls replaced with absolute ones
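
    Example::

        convert_relative_paths_to_absolute(
            "https://example.com/post/1", None, '<img src="/photos/a.jpg">'
        )
        # -> '<img src="https://example.com/photos/a.jpg">'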
"""
def do_convert(match):
base_url = urljoin(source_url, base_href) if base_href else source_url
return (
match.string[match.start(0) : match.start(1)]
+ urljoin(base_url, match.group(1))
+ match.string[match.end(1) : match.end(0)]
)
if source_url:
for tagname, attributes in URL_ATTRIBUTES.items():
for attribute in attributes:
pattern = re.compile(
rf"<{tagname}[^>]*?{attribute}\s*=\s*['\"](.*?)['\"]",
flags=re.DOTALL | re.MULTILINE | re.IGNORECASE,
)
html = pattern.sub(do_convert, html)
return html
def is_name_a_title(name, content):
"""Determine whether the name property represents an explicit title.
Typically when parsing an h-entry, we check whether p-name ==
e-content (value). If they are non-equal, then p-name likely
represents a title.
However, occasionally we come across an h-entry that does not
provide an explicit p-name. In this case, the name is
automatically generated by converting the entire h-entry content
to plain text. This definitely does not represent a title, and
looks very bad when displayed as such.
To handle this case, we broaden the equality check to see if
content is a subset of name. We also strip out non-alphanumeric
characters just to make the check a little more forgiving.
:param str name: the p-name property that may represent a title
:param str content: the plain-text version of an e-content property
:return: True if the name likely represents a separate, explicit title
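
    Example::

        is_name_a_title("My Great Post", "Today I wrote a thing...")       # -> True
        is_name_a_title("Today I wrote a thing", "Today I wrote a thing")  # -> False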
"""
def normalize(s):
if not isinstance(s, str):
s = s.decode("utf-8")
s = unicodedata.normalize("NFKD", s)
s = s.lower()
s = re.sub("[" + string.whitespace + string.punctuation + "]", "", s)
return s
if not content:
return True
if not name:
return False
return normalize(content) not in normalize(name)
def post_type_discovery(hentry):
"""Implementation of the post-type discovery algorithm
defined here https://indiewebcamp.com/post-type-discovery#Algorithm
:param dict hentry: mf2 item representing the entry to test
:return: string, one of: 'org', 'person', 'event', 'rsvp',
'invite', 'reply', 'repost', 'like', 'photo',
'article', 'note', 'follow'
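
    Illustrative example (a minimal, hand-built h-entry)::

        post_type_discovery(
            {"type": ["h-entry"], "properties": {"like-of": ["https://example.com/post"]}}
        )
        # -> "like"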
"""
props = hentry.get("properties", {})
if "h-card" in hentry.get("type", []):
name = get_plain_text(props.get("name"))
org = get_plain_text(props.get("org"))
if name and org and name == org:
return "org"
return "person"
if "h-event" in hentry.get("type", []):
return "event"
for prop, implied_type in [
("rsvp", "rsvp"),
("invitee", "invite"),
("in-reply-to", "reply"),
("repost-of", "repost"),
("like-of", "like"),
("follow-of", "follow"),
("photo", "photo"),
]:
if props.get(prop) is not None:
return implied_type
# check name ~= content
name = get_plain_text(props.get("name"))
content = get_plain_text(props.get("content"))
if not content:
content = get_plain_text(props.get("summary"))
if content and name and is_name_a_title(name, content):
return "article"
return "note"
def _interpret_common_properties(
parsed,
source_url,
base_href,
hentry,
use_rel_syndication,
want_json,
fetch_mf2_func,
):
result = {}
props = hentry["properties"]
for prop in ("url", "uid", "photo", "featured" "logo"):
value = get_plain_text(props.get(prop))
if value:
result[prop] = value
for prop in ("start", "end", "published", "updated", "deleted"):
date_str = get_plain_text(props.get(prop))
if date_str:
if want_json:
result[prop] = date_str
else:
result[prop + "-str"] = date_str
try:
date = parse_dt(date_str)
if date:
result[prop] = date
except ValueError:
raise ValueError(f"Failed to parse datetime {date_str}")
author = find_author(parsed, source_url, hentry, fetch_mf2_func)
if author:
result["author"] = author
content_prop = props.get("content")
content_value = None
if content_prop:
if isinstance(content_prop[0], dict):
content_html = content_prop[0].get("html", "").strip()
content_value = content_prop[0].get("value", "").strip()
else:
content_value = content_html = content_prop[0]
result["content"] = convert_relative_paths_to_absolute(
source_url, base_href, content_html
)
result["content-plain"] = content_value
summary_prop = props.get("summary")
if summary_prop:
if isinstance(summary_prop[0], dict):
result["summary"] = summary_prop[0]["value"]
else:
result["summary"] = summary_prop[0]
# Collect location objects, then follow this algorithm to consolidate their
# properties:
# https://indieweb.org/location#How_to_determine_the_location_of_a_microformat
location_stack = [props]
for prop in "location", "adr":
vals = props.get(prop)
if vals:
if isinstance(vals[0], str):
location_stack.append({"name": vals})
else:
location_stack.append(vals[0].get("properties", {}))
geo = props.get("geo")
if geo:
if isinstance(geo[0], dict):
location_stack.append(geo[0].get("properties", {}))
else:
if geo[0].startswith("geo:"):
# a geo: URL. try to parse it. https://tools.ietf.org/html/rfc5870
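                    # e.g. a (hypothetical) "geo:37.786971,-122.399677;u=35"
                    # yields latitude 37.786971 and longitude -122.399677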
parts = geo[0][len("geo:") :].split(";")[0].split(",")
if len(parts) >= 2:
location_stack.append(
{
"latitude": [parts[0]],
"longitude": [parts[1]],
"altitude": [parts[2]] if len(parts) >= 3 else [],
}
)
for prop in LOCATION_PROPERTIES:
for obj in location_stack:
if obj and obj.get(prop) and not (obj == props and prop == "name"):
result.setdefault("location", {})[prop] = obj[prop][0]
if use_rel_syndication:
result["syndication"] = list(
set(
parsed.get("rels", {}).get("syndication", [])
+ hentry["properties"].get("syndication", [])
)
)
else:
result["syndication"] = hentry["properties"].get("syndication", [])
return result
def interpret_event(
parsed,
source_url,
base_href=None,
hevent=None,
use_rel_syndication=True,
want_json=False,
fetch_mf2_func=None,
):
"""Given a document containing an h-event, return a dictionary::
{
'type': 'event',
'url': the permalink url of the document (may be different than source_url),
'start': datetime or date,
'end': datetime or date,
'name': plain-text event name,
'content': body of event description (contains HTML)
}
:param dict parsed: the result of parsing a document containing mf2 markup
    :param str source_url: the URL of the parsed document
:param str base_href: (optional) the href value of the base tag
:param dict hevent: (optional) the item in the above document representing
the h-event. if provided, we can avoid a redundant call to
find_first_entry
:param boolean use_rel_syndication: (optional, default True) Whether
to include rel=syndication in the list of syndication sources. Sometimes
useful to set this to False when parsing h-feeds that erroneously include
rel=syndication on each entry.
:param boolean want_json: (optional, default false) if true, the result
will be pure json with datetimes as strings instead of python objects
:param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
output for a given URL.
:return: a dict with some or all of the described properties
"""
# find the h-event if it wasn't provided
if not hevent:
hevent = find_first_entry(parsed, ["h-event"])
if not hevent:
return {}
result = _interpret_common_properties(
parsed,
source_url,
base_href,
hevent,
use_rel_syndication,
want_json,
fetch_mf2_func,
)
result["type"] = "event"
name_value = get_plain_text(hevent["properties"].get("name"))
if name_value:
result["name"] = name_value
return result
def interpret_entry(
parsed,
source_url,
base_href=None,
hentry=None,
use_rel_syndication=True,
want_json=False,
fetch_mf2_func=None,
):
"""Given a document containing an h-entry, return a dictionary::
{
'type': 'entry',
'url': the permalink url of the document (may be different than source_url),
'published': datetime or date,
'updated': datetime or date,
'name': title of the entry,
'content': body of entry (contains HTML),
'author': {
'name': author name,
'url': author url,
'photo': author photo
},
'syndication': [
'syndication url',
...
],
'in-reply-to': [...],
'like-of': [...],
'repost-of': [...],
}
:param dict parsed: the result of parsing a document containing mf2 markup
:param str source_url: the URL of the parsed document, used by the
authorship algorithm
:param str base_href: (optional) the href value of the base tag
:param dict hentry: (optional) the item in the above document
representing the h-entry. if provided, we can avoid a redundant
call to find_first_entry
:param boolean use_rel_syndication: (optional, default True) Whether
to include rel=syndication in the list of syndication sources. Sometimes
useful to set this to False when parsing h-feeds that erroneously include
rel=syndication on each entry.
:param boolean want_json: (optional, default False) if true, the result
will be pure json with datetimes as strings instead of python objects
:param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
output for a given URL.
:return: a dict with some or all of the described properties
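
    Illustrative sketch (hypothetical URL)::

        parsed = parse(url="https://example.com/post/1")
        entry = interpret_entry(parsed, "https://example.com/post/1")
        entry.get("name"), entry.get("published"), entry.get("author")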
"""
# find the h-entry if it wasn't provided
if not hentry:
hentry = find_first_entry(parsed, ["h-entry"])
if not hentry:
return {}
result = _interpret_common_properties(
parsed,
source_url,
base_href,
hentry,
use_rel_syndication,
want_json,
fetch_mf2_func,
)
if "h-cite" in hentry.get("type", []):
result["type"] = "cite"
else:
result["type"] = "entry"
title = get_plain_text(hentry["properties"].get("name"))
if title and is_name_a_title(title, result.get("content-plain")):
result["name"] = title
for prop in (
"in-reply-to",
"like-of",
"repost-of",
"bookmark-of",
"comment",
"like",
"repost",
):
for url_val in hentry["properties"].get(prop, []):
if isinstance(url_val, dict):
result.setdefault(prop, []).append(
interpret(
parsed,
source_url,
base_href,
url_val,
use_rel_syndication=False,
want_json=want_json,
fetch_mf2_func=fetch_mf2_func,
)
)
else:
result.setdefault(prop, []).append(
{
"url": url_val,
}
)
return result
def interpret_feed(
parsed, source_url, base_href=None, hfeed=None, want_json=False, fetch_mf2_func=None
):
"""Interpret a source page as an h-feed or as an top-level collection
of h-entries.
:param dict parsed: the result of parsing a mf2 document
:param str source_url: the URL of the source document (used for authorship
discovery)
:param str base_href: (optional) the href value of the base tag
    :param dict hfeed: (optional) the h-feed to be parsed. If provided,
this will be used instead of the first h-feed on the page.
:param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
output for a given URL.
:return: a dict containing 'entries', a list of entries, and possibly other
feed properties (like 'name').
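
    Illustrative sketch (hypothetical URL)::

        parsed = parse(url="https://example.com/")
        feed = interpret_feed(parsed, "https://example.com/")
        for entry in feed["entries"]:
            print(entry.get("url"), entry.get("name"))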
"""
result = {}
# find the first feed if it wasn't provided
if not hfeed:
hfeed = find_first_entry(parsed, ["h-feed"])
if hfeed:
names = hfeed["properties"].get("name")
if names:
result["name"] = names[0]
        children = hfeed.get("children", [])
    else:
        # just use the top-level 'items' as the feed children
        children = parsed.get("items", [])
entries = []
for child in children:
entry = interpret(
parsed,
source_url,
base_href,
item=child,
use_rel_syndication=False,
want_json=want_json,
fetch_mf2_func=fetch_mf2_func,
)
if entry:
entries.append(entry)
result["entries"] = entries
return result
def interpret(
parsed,
source_url,
base_href=None,
item=None,
use_rel_syndication=True,
want_json=False,
fetch_mf2_func=None,
):
"""Interpret a permalink of unknown type. Finds the first interesting
h-* element, and delegates to :func:`interpret_entry` if it is an
h-entry or :func:`interpret_event` for an h-event
:param dict parsed: the result of parsing a mf2 document
:param str source_url: the URL of the source document (used for authorship
discovery)
:param str base_href: (optional) the href value of the base tag
:param dict item: (optional) the item to be parsed. If provided,
this will be used instead of the first element on the page.
:param boolean use_rel_syndication: (optional, default True) Whether
to include rel=syndication in the list of syndication sources. Sometimes
useful to set this to False when parsing h-feeds that erroneously include
rel=syndication on each entry.
:param boolean want_json: (optional, default False) If true, the result
will be pure json with datetimes as strings instead of python objects
:param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
output for a given URL.
:return: a dict as described by interpret_entry or interpret_event, or None
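
    Illustrative sketch (hypothetical URL)::

        parsed = parse(url="https://example.com/some-permalink")
        post = interpret(parsed, "https://example.com/some-permalink")
        if post:
            print(post["type"], post.get("url"))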
"""
if not item:
item = find_first_entry(parsed, ["h-entry", "h-event"])
if item:
types = item.get("type", [])
if "h-event" in types:
return interpret_event(
parsed,
source_url,
base_href=base_href,
hevent=item,
use_rel_syndication=use_rel_syndication,
want_json=want_json,
fetch_mf2_func=fetch_mf2_func,
)
elif "h-entry" in types or "h-cite" in types:
return interpret_entry(
parsed,
source_url,
base_href=base_href,
hentry=item,
use_rel_syndication=use_rel_syndication,
want_json=want_json,
fetch_mf2_func=fetch_mf2_func,
)
def interpret_comment(
parsed,
source_url,
target_urls,
base_href=None,
want_json=False,
fetch_mf2_func=None,
):
"""Interpret received webmentions, and classify as like, reply, or
repost (or a combination thereof). Returns a dict as described
in :func:`interpret_entry`, with the additional fields::
{
'comment_type': a list of strings, zero or more of
'like', 'reply', or 'repost'
'rsvp': a string containing the rsvp response (optional)
}
    :param dict parsed: an mf2py parsed document
    :param str source_url: the URL of the source document
    :param list target_urls: a collection containing the URL of the target
      document, and any alternate URLs (e.g., shortened links) that should
      be considered equivalent when looking for references
:param str base_href: (optional) the href value of the base tag
:param boolean want_json: (optional, default False) If true, the result
will be pure json with datetimes as strings instead of python objects
:param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
output for a given URL.
:return: a dict as described above, or None
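
    Illustrative sketch (hypothetical URLs)::

        parsed = parse(url="https://commenter.example/reply-1")
        comment = interpret_comment(
            parsed,
            "https://commenter.example/reply-1",
            ["https://example.com/original-post"],
        )
        if comment:
            print(comment["comment_type"], comment.get("rsvp"))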
"""
item = find_first_entry(parsed, ["h-entry"])
if item:
result = interpret_entry(
parsed,
source_url,
base_href=base_href,
hentry=item,
want_json=want_json,
fetch_mf2_func=fetch_mf2_func,
)
if result:
result["comment_type"] = classify_comment(parsed, target_urls)
rsvp = get_plain_text(item["properties"].get("rsvp"))
if rsvp:
result["rsvp"] = rsvp.lower()
invitees = item["properties"].get("invitee")
if invitees:
result["invitees"] = [parse_author(inv) for inv in invitees]
return result
# ===========================================================================
stable = {
"adr": [
"p-street-address",
"p-extended-address",
"p-post-office-box",
"p-locality",
"p-region",
"p-postal-code",
"p-country-name",
"p-label",
"p/u-geo",
"p-latitude",
"p-longitude",
"p-altitude",
],
"card": [
"p-name",
"p-honorific-prefix",
"p-given-name",
"p-additional-name",
"p-family-name",
"p-sort-string",
"p-honorific-suffix",
"p-nickname",
"u-email",
"u-logo",
"u-photo",
"u-url",
"u-uid",
"p-category",
"p/h-adr",
"p-post-office-box",
"p-extended-address",
"p-street-address",
"p-locality",
"p-region",
"p-postal-code",
"p-country-name",
"p-label",
"p/u/h-geo",
"p-latitude",
"p-longitude",
"p-altitude",
"p-tel",
"p-note",
"dt-bday",
"u-key",
"p-org",
"p-job-title",
"p-role",
"u-impp",
"p-sex",
"p-gender-identity",
"dt-anniversary",
],
"entry": [
"p-name",
"p-summary",
"e-content",
"dt-published",
"dt-updated",
"p-author",
"p-category",
"u-url",
"u-uid",
"p-location",
"u-syndication",
"u-in-reply-to",
"p-rsvp",
"u-like-of",
"u-repost-of",
],
"event": [
"p-name",
"p-summary",
"dt-start",
"dt-end",
"dt-duration",
"e-content",
"u-url",
"p-category",
"p-location(card/adr/geo)",
"[p-attendee]",
],
"feed": ["p-name", "p-author(card)", "u-url", "u-photo"],
"geo": ["p-latitude", "p-longitude", "p-altitude"],
"item": ["p-name", "u-url", "u-photo"],
"product": [
"p-name",
"u-photo",
"p-brand(card)",
"p-category",
"e-content",
"u-url",
"u-identifier",
"p-review(review)",
"p-price",
],
"recipe": [
"p-name",
"p-ingredient",
"p-yield",
"e-instructions",
"dt-duration",
"u-photo",
"p-summary",
"p-author(card)",
"dt-published",
"p-nutrition",
"p-category",
],
"resume": [
"p-name",
"p-summary",
"p-contact",
"p-education(event+card)",
"p-experience(event+card)",
"p-skill",
"p-affiliation",
],
"review": [
"p-name ",
"p-item(card/event/adr/geo/product/item)",
"p-author(card)",
"dt-published",
"p-rating",
"p-best",
"p-worst",
"e-content",
"p-category",
"u-url",
],
"review-aggregate": [
"p-item(card/event/adr/geo/product/item)",
"p-average",
"p-best",
"p-worst",
"p-count",
"p-votes",
"p-name",
],
}
draft = {"app": ["p-name", "u-url", "u-logo", "u-photo"]}
def representative_card(mf2json: dict, source_url: str) -> dict:
"""
    Return the representative card for the given parsed document.
http://microformats.org/wiki/representative-h-card-parsing
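
    Illustrative sketch (hypothetical URL)::

        parsed = parse(url="https://example.com")
        representative_card(parsed, "https://example.com")
        # -> the matching h-card's properties dict, or {} if none qualifies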
"""
source = easyuri.parse(source_url).minimized
cards = [
card
for card in _get_all_items(mf2json, ["h-card"], include_props=True)
if (
card["properties"].get("name", [""])[0]
or card["properties"].get("nickname", [""])[0]
)
]
if match := _check_uid_and_url_match_source_url(cards, source):
return match
if match := _check_url_matches_rel_me(cards, mf2json):
return match
if match := _check_url_matches_source_url(cards, source):
return match
return {}
def _check_uid_and_url_match_source_url(cards, source_url): # FIXME same as below?
""""""
for card in cards:
if source_url in _get_normalized_urls(
card, "uid"
) and source_url in _get_normalized_urls(card, "url"):
return card["properties"]
def _check_url_matches_rel_me(cards, parsed):
""""""
for card in cards:
rel_mes = set()
for rel_me in parsed.get("rels", {}).get("me", []):
try:
rel_me = easyuri.parse(rel_me)
except ValueError:
continue
if isinstance(rel_me, (easyuri.HTTPURI, easyuri.HTTPSURI)):
rel_mes.add(rel_me.minimized)
if any(url in rel_mes for url in _get_normalized_urls(card, "url")):
return card["properties"]
def _check_url_matches_source_url(cards, source_url): # FIXME same as above?
""""""
found = []
count = 0
for card in cards:
# if source_url in card['properties'].get('url', []):
for card_url in _get_normalized_urls(card, "url"):
if card_url.rstrip("/") == source_url:
found.append(card)
count += 1
if count:
return found[0]["properties"]
def representative_feed(mf2json: dict, source_url: str, source_dom=None):
"""
    Return the representative feed for the given parsed document.
https://indieweb.org/feed#How_To_Consume
https://microformats.org/wiki/h-feed#Discovery
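
    Illustrative sketch (hypothetical URL and markup; bs4 is imported by this
    module and is only needed for the <title> fallback)::

        html = "<html>...</html>"  # markup fetched elsewhere
        parsed = parse(doc=html, url="https://example.com")
        representative_feed(
            parsed,
            "https://example.com",
            source_dom=bs4.BeautifulSoup(html, "html.parser"),
        )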
"""
feed = {}
try:
feed["name"] = source_dom.select("title")[0].text
except (AttributeError, IndexError):
pass
if author := representative_card(mf2json, source_url):
feed["author"] = author
items = []
if first_feed := _get_first_item(mf2json, ["h-feed"]):
if name := first_feed["properties"].get("name"):
feed["name"] = [name]
if authors := first_feed["properties"].get("author"):
feed["author"] = []
for author in authors:
author["properties"]["type"] = author["type"]
feed["author"].append(author["properties"])
        if children := first_feed.get("children"):
items = children
else:
items = _get_all_items(mf2json, ["h-entry", "h-event"])
feed["items"] = []
for item in items:
if item.get("source") == "metaformats":
continue
item["properties"]["type"] = item["type"]
feed["items"].append(item["properties"])
if rel_next := mf2json["rels"].get("next"):
feed["next"] = rel_next[0]
if rel_prev := mf2json["rels"].get("prev"):
feed["prev"] = rel_prev[0]
return feed
def discover_post_type(properties):
"""
Return the discovered post type.
http://ptd.spec.indieweb.org/#x5-post-type-algorithm
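
    Example::

        discover_post_type({"like-of": ["https://example.com/post"]})  # -> "like"
        discover_post_type({"content": ["hello world"]})               # -> "note"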
"""
type_specific_properties = {
"rsvp": "rsvp",
"repost-of": "repost", # aka share
"like-of": "like", # aka favorite
"in-reply-to": "reply",
"listen-of": "listen",
"bookmark-of": "bookmark",
"checkin": "check-in",
"video": "video",
"audio": "audio",
"photo": "photo",
# TODO "checkin": "checkin",
# TODO "bookmark-of": "bookmark",
# TODO "follow-of": "follow",
# TODO "weight": "weight",
}
for type_specific_property, post_type in type_specific_properties.items():
if type_specific_property in properties:
if (
post_type in ("video", "audio", "photo")
and "quotation-of" in properties
):
return f"{post_type}/clip"
return post_type
content = ""
try:
content = _get_first_non_empty(properties["content"])
except KeyError:
try:
content = _get_first_non_empty(properties["summary"])
except KeyError:
return "note"
name = ""
try:
name = _get_first_non_empty(properties["name"])
except KeyError:
return "note"
if name:
try:
content = dict(content)
except ValueError:
text_content = content
else:
            text_content = bs4.BeautifulSoup(content["html"].strip(), "html.parser").text
if not text_content.startswith(name):
return "article"
return "note"
def _get_first_item(mf2json: dict, item_type: set):
"""Return the first object(s) of given item_type(s) (eg. h-entry, h-event)."""
return next(_yield_all_items(mf2json, item_type, False), None)
def _get_all_items(mf2json: dict, item_type: set, include_props=False):
"""Return all object(s) of given item_type(s) (eg. h-entry, h-event)."""
return list(_yield_all_items(mf2json, item_type, include_props))
def _yield_all_items(mf2json: dict, item_type: set, include_props: bool):
"""
    Yield objects of the given item_type(s) in breadth-first order.
    Traverses the top-level items and their children and descendants.
    Property values (e.g. the h-card value of "p-author h-card") are
    searched only if `include_props` is True.
"""
queue = collections.deque(item for item in mf2json["items"])
while queue:
item = queue.popleft()
if any(h_class in item.get("type", []) for h_class in item_type):
yield item
queue.extend(item.get("children", []))
if include_props:
queue.extend(
prop
for props in item.get("properties", {}).values()
for prop in props
if isinstance(prop, dict)
)
def _get_normalized_urls(card, prop):
"""Return a list of normalized URLs for an card's prop (uid/url)."""
urls = []
for url in card["properties"].get(prop, []):
try:
urls.append(easyuri.parse(url).minimized)
except ValueError:
pass
return urls
def _get_first_non_empty(propval):
"""
Return the first non-empty value in `propval`.
If `propval` is not a list and non-empty, return it.
"""
if not isinstance(propval, list):
propval = [propval]
for content in propval:
if content:
return content