Codebase list zonedb / bc6f85f internal / build / iana_idn_tables.go
bc6f85f

Tree @bc6f85f (Download .tar.gz)

iana_idn_tables.go @bc6f85fraw · history · blame

package build

import (
	"errors"
	"net/url"
	"strings"
	"sync/atomic"

	"github.com/PuerkitoBio/goquery"
)

const (
	// ianaTablesURL is the IANA index page that FetchIDNTablesFromIANA downloads
	// and scrapes; ianaBaseURL is the base against which the hrefs found on that
	// page are resolved into absolute URLs.
	//
	// I override the URL during dev.
	ianaTablesURL = "https://www.iana.org/domains/idn-tables"
	ianaBaseURL   = "https://www.iana.org"
)

// FetchIDNTablesFromIANA fetches IDN table references from the IANA website.
//
// It downloads the index page at ianaTablesURL, visits the first cell of every
// row of the "idn-table" table, and for each row whose domain (without its
// leading dot) is a key of zones, records the linked table on that zone via
// AddPolicy(TypeIDNTable, ...). Rows that cannot be interpreted are traced and
// skipped; they never abort the run.
//
// An error is returned when ianaBaseURL cannot be parsed, when the page cannot
// be fetched or parsed as HTML, or when no entry at all was extracted while
// TLDs(zones) holds more than 100 entries.
func FetchIDNTablesFromIANA(zones map[string]*Zone) error {
	tlds := TLDs(zones)
	baseURL, err := url.Parse(ianaBaseURL)
	if err != nil {
		return err
	}
	res, err := Fetch(ianaTablesURL)
	if err != nil {
		return err
	}
	defer res.Body.Close()
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return err
	}

	var (
		matchCount     uint64
		extractedCount uint64
	)
	doc.Find("body table#idn-table > tbody > tr > td:first-child").Each(func(i int, s *goquery.Selection) {
		atomic.AddUint64(&matchCount, 1)
		domain := s.Find("span").Text()
		// forLabel is only used in diagnostics: prefer the link text, fall back
		// to the domain when the cell has no link text.
		forLabel := s.Find("a").Text()
		if forLabel == "" {
			forLabel = domain
		}
		partURL, exists := s.Find("a[href]").Attr("href")
		if !exists {
			Trace("@{r}missing href for %q\n", forLabel)
			return
		}
		u, err := baseURL.Parse(partURL)
		if err != nil {
			Trace("@{r}failed to parse %q for %q\n", partURL, forLabel)
			return
		}

		// At this point, "domain" should look like ".<tld>" and u.String() is the absolute URL of the table.
		// The partURL's last component, after directory-separator, looks like "<tld>_<language>_<version.info>.txt"
		//
		// HasPrefix (rather than indexing domain[0]) keeps a cell without any
		// <span> text from panicking on an empty string.
		if !strings.HasPrefix(domain, ".") {
			Trace("@{r}bad domain name %q\n", domain)
			return
		}
		domain = domain[1:] // trim dot
		z, ok := zones[domain]
		if !ok {
			// Only report unknown zones when a large zone set is loaded.
			// NOTE(review): presumably a small set means a partial/dev run where
			// most zones are expected to be missing — confirm.
			if len(tlds) > 100 {
				Trace("@{r}unknown zone %q from %s\n", domain, u.String())
			}
			return
		}

		lang, err := langFromURL(partURL)
		if err != nil {
			Trace("@{y}unable to extract language tag or script for zone %q from %s\n", domain, partURL)
			return
		}

		z.AddPolicy(TypeIDNTable, lang, u.String(), "") // "Fetched from "+ianaTablesURL)

		atomic.AddUint64(&extractedCount, 1)
	})
	Trace("@{.}saw %d matches, extracted %d entries\n", matchCount, extractedCount)
	// Extracting nothing with a large zone set loaded most likely means the
	// selector above no longer matches the page.
	if extractedCount == 0 && len(tlds) > 100 {
		return errors.New("failed to extract any URLs from IANA index page, HTML change?")
	}

	// TODO: Do we want to _remove_ URLs if zone not found here?
	//       How do we handle multiple sources of URLs if so?

	return nil
}

// langFromURL extracts the language (or script) field from the final path
// component of an IDN table URL and returns it normalized by normalizeLang.
//
// The component is split on "_" and the second field is taken as the language.
// errMalformedURL is returned when u contains no "/" or when the final
// component has fewer than three "_"-separated fields.
func langFromURL(u string) (string, error) {
	slash := strings.LastIndex(u, "/")
	if slash < 0 {
		return "", errMalformedURL
	}

	// Only the second field is needed, so splitting stops after the third piece.
	fields := strings.SplitN(u[slash+1:], "_", 3)
	if len(fields) != 3 {
		return "", errMalformedURL
	}

	// The first field is deliberately not compared with the caller's zone:
	//  1. IDN TLDs use other strings there;
	//  2. tables are shared between zones (eg, "academy" is the baseline for many).
	// That sharing also means a per-URL cache would be worthwhile, so the same
	// URL is not fetched N times.
	return normalizeLang(fields[1])
}

// errMalformedURL is returned by langFromURL when the URL has no "/" or its
// last path component does not have at least three "_"-separated fields.
var errMalformedURL = errors.New("malformed IDN table URL")