Codebase list subfinder / ca015394-c5c1-40b2-afbe-a7c19330555b/upstream/2.5.5 v2 / pkg / subscraping / sources / github / github.go
ca015394-c5c1-40b2-afbe-a7c19330555b/upstream/2.5.5

Tree @ca015394-c5c1-40b2-afbe-a7c19330555b/upstream/2.5.5 (Download .tar.gz)

github.go @ca015394-c5c1-40b2-afbe-a7c19330555b/upstream/2.5.5raw · history · blame

// Package github GitHub search package
// Based on gwen001's https://github.com/gwen001/github-search github-subdomains
package github

import (
	"bufio"
	"context"
	"fmt"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	jsoniter "github.com/json-iterator/go"

	"github.com/tomnomnom/linkheader"

	"github.com/projectdiscovery/gologger"
	"github.com/projectdiscovery/subfinder/v2/pkg/subscraping"
)

type textMatch struct {
	Fragment string `json:"fragment"`
}

type item struct {
	Name        string      `json:"name"`
	HTMLURL     string      `json:"html_url"`
	TextMatches []textMatch `json:"text_matches"`
}

type response struct {
	TotalCount int    `json:"total_count"`
	Items      []item `json:"items"`
}

// Source is the passive scraping agent
type Source struct {
	apiKeys []string
}

// Run function returns all subdomains found with the service
func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Session) <-chan subscraping.Result {
	results := make(chan subscraping.Result)

	go func() {
		defer close(results)

		if len(s.apiKeys) == 0 {
			gologger.Debug().Msgf("Cannot use the '%s' source because there was no key defined for it.", s.Name())
			return
		}

		tokens := NewTokenManager(s.apiKeys)

		searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=%s&sort=created&order=asc", domain)
		s.enumerate(ctx, searchURL, domainRegexp(domain), tokens, session, results)
	}()

	return results
}

func (s *Source) enumerate(ctx context.Context, searchURL string, domainRegexp *regexp.Regexp, tokens *Tokens, session *subscraping.Session, results chan subscraping.Result) {
	select {
	case <-ctx.Done():
		return
	default:
	}

	token := tokens.Get()

	if token.RetryAfter > 0 {
		if len(tokens.pool) == 1 {
			gologger.Verbose().Label(s.Name()).Msgf("GitHub Search request rate limit exceeded, waiting for %d seconds before retry... \n", token.RetryAfter)
			time.Sleep(time.Duration(token.RetryAfter) * time.Second)
		} else {
			token = tokens.Get()
		}
	}

	headers := map[string]string{"Accept": "application/vnd.github.v3.text-match+json", "Authorization": "token " + token.Hash}

	// Initial request to GitHub search
	resp, err := session.Get(ctx, searchURL, "", headers)
	isForbidden := resp != nil && resp.StatusCode == http.StatusForbidden
	if err != nil && !isForbidden {
		results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
		session.DiscardHTTPResponse(resp)
		return
	}

	// Retry enumerarion after Retry-After seconds on rate limit abuse detected
	ratelimitRemaining, _ := strconv.ParseInt(resp.Header.Get("X-Ratelimit-Remaining"), 10, 64)
	if isForbidden && ratelimitRemaining == 0 {
		retryAfterSeconds, _ := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
		tokens.setCurrentTokenExceeded(retryAfterSeconds)
		resp.Body.Close()

		s.enumerate(ctx, searchURL, domainRegexp, tokens, session, results)
	}

	var data response

	// Marshall json response
	err = jsoniter.NewDecoder(resp.Body).Decode(&data)
	if err != nil {
		results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
		resp.Body.Close()
		return
	}

	resp.Body.Close()

	err = proccesItems(ctx, data.Items, domainRegexp, s.Name(), session, results)
	if err != nil {
		results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
		return
	}

	// Links header, first, next, last...
	linksHeader := linkheader.Parse(resp.Header.Get("Link"))
	// Process the next link recursively
	for _, link := range linksHeader {
		if link.Rel == "next" {
			nextURL, err := url.QueryUnescape(link.URL)
			if err != nil {
				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
				return
			}
			s.enumerate(ctx, nextURL, domainRegexp, tokens, session, results)
		}
	}
}

// proccesItems procceses github response items
func proccesItems(ctx context.Context, items []item, domainRegexp *regexp.Regexp, name string, session *subscraping.Session, results chan subscraping.Result) error {
	for _, item := range items {
		// find subdomains in code
		resp, err := session.SimpleGet(ctx, rawURL(item.HTMLURL))
		if err != nil {
			if resp != nil && resp.StatusCode != http.StatusNotFound {
				session.DiscardHTTPResponse(resp)
			}
			return err
		}

		if resp.StatusCode == http.StatusOK {
			scanner := bufio.NewScanner(resp.Body)
			for scanner.Scan() {
				line := scanner.Text()
				if line == "" {
					continue
				}
				for _, subdomain := range domainRegexp.FindAllString(normalizeContent(line), -1) {
					results <- subscraping.Result{Source: name, Type: subscraping.Subdomain, Value: subdomain}
				}
			}
			resp.Body.Close()
		}

		// find subdomains in text matches
		for _, textMatch := range item.TextMatches {
			for _, subdomain := range domainRegexp.FindAllString(normalizeContent(textMatch.Fragment), -1) {
				results <- subscraping.Result{Source: name, Type: subscraping.Subdomain, Value: subdomain}
			}
		}
	}
	return nil
}

// Normalize content before matching, query unescape, remove tabs and new line chars
func normalizeContent(content string) string {
	normalizedContent, _ := url.QueryUnescape(content)
	normalizedContent = strings.ReplaceAll(normalizedContent, "\\t", "")
	normalizedContent = strings.ReplaceAll(normalizedContent, "\\n", "")
	return normalizedContent
}

// Raw URL to get the files code and match for subdomains
func rawURL(htmlURL string) string {
	domain := strings.ReplaceAll(htmlURL, "https://github.com/", "https://raw.githubusercontent.com/")
	return strings.ReplaceAll(domain, "/blob/", "/")
}

// DomainRegexp regular expression to match subdomains in github files code
func domainRegexp(domain string) *regexp.Regexp {
	rdomain := strings.ReplaceAll(domain, ".", "\\.")
	return regexp.MustCompile("(\\w[a-zA-Z0-9][a-zA-Z0-9-\\.]*)" + rdomain)
}

// Name returns the name of the source
func (s *Source) Name() string {
	return "github"
}

func (s *Source) IsDefault() bool {
	return false
}

func (s *Source) HasRecursiveSupport() bool {
	return false
}

func (s *Source) NeedsKey() bool {
	return true
}

func (s *Source) AddApiKeys(keys []string) {
	s.apiKeys = keys
}