Adding support for graphdb #174

Draft · wants to merge 13 commits into dev
1 change: 1 addition & 0 deletions cmd/katana/main.go
@@ -131,6 +131,7 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.BoolVar(&options.Silent, "silent", false, "display output only"),
flagSet.BoolVarP(&options.Verbose, "verbose", "v", false, "display verbose output"),
flagSet.BoolVar(&options.Version, "version", false, "display project version"),
+	flagSet.StringVarP(&options.OutputGraph, "output-graph", "og", "", "graph folder (one graph per URL will be created)"),
)

if err := flagSet.Parse(); err != nil {
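Note: the `options.OutputGraph` field that backs this flag is not part of the diff shown here; presumably it is added to the crawler options struct in `pkg/types`. A minimal sketch of the assumed wiring, with the field name taken from the flag registration above:

```go
// pkg/types/types.go — hypothetical excerpt, not shown in this diff
package types

// Options contains the configuration options for the crawler.
type Options struct {
	// ... existing fields ...

	// OutputGraph is the folder in which one crawl graph per URL is
	// written; populated by the -output-graph / -og flag.
	OutputGraph string
}
```

With that field in place, the feature would be driven as, e.g., `katana -u https://example.com -og ./graphs`.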
2 changes: 2 additions & 0 deletions go.mod
@@ -4,10 +4,12 @@ go 1.18

require (
github.com/PuerkitoBio/goquery v1.8.0
+	github.com/dominikbraun/graph v0.14.0
github.com/go-rod/rod v0.112.3
github.com/json-iterator/go v1.1.12
github.com/logrusorgru/aurora v2.0.3+incompatible
github.com/lukasbob/srcset v0.0.0-20190730101422-86b742e617f3
+	github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6
github.com/pkg/errors v0.9.1
github.com/projectdiscovery/fastdialer v0.0.20
github.com/projectdiscovery/goflags v0.1.6
4 changes: 4 additions & 0 deletions go.sum
@@ -19,6 +19,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi/U=
github.com/dimchansky/utfbom v1.1.1/go.mod h1:SxdoEBH5qIqFocHMyGOXVAybYJdr71b1Q/j0mACtrfE=
+github.com/dominikbraun/graph v0.14.0 h1:Q1q7OQIKMPDQVNkwRhWQ5BUxCGM1tkcISH5sY6yNj+8=
+github.com/dominikbraun/graph v0.14.0/go.mod h1:yOjYyogZLY1LSG9E33JWZJiq5k83Qy2C6POAuiViluc=
github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
@@ -77,6 +79,8 @@ github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
github.com/lukasbob/srcset v0.0.0-20190730101422-86b742e617f3 h1:l1rIRmxNhzeQM+qA3D0CsDLo0Hx45q9JmK0BlCjt6Ks=
github.com/lukasbob/srcset v0.0.0-20190730101422-86b742e617f3/go.mod h1:j16TYl5p17+vBMyaL6Nu4ojlOnfX8lc2k2cfmw6m5TQ=
+github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 h1:bjfMeqxWEJ6IRUvGkiTkSwx0a6UdQJsbirRSoXogteY=
+github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6/go.mod h1:WVJJvUw/pIOcwu2O8ZzHEhmigq2jzwRNfJVRMJB7bR8=
github.com/mholt/archiver v3.1.1+incompatible h1:1dCVxuqs0dJseYEhi5pl7MYPH9zDa1wBi7mF09cbNkU=
github.com/mholt/archiver v3.1.1+incompatible/go.mod h1:Dh2dOXnSdiLxRiPoVfIr/fI1TwETms9B8CTWfeh7ROU=
github.com/microcosm-cc/bluemonday v1.0.21 h1:dNH3e4PSyE4vNX+KlRGHT5KrSvjeUkoNPwEORjffHJg=
35 changes: 17 additions & 18 deletions pkg/engine/hybrid/crawl.go
@@ -2,7 +2,6 @@ package hybrid

import (
"bytes"
"context"
"io"
"net/http"
"net/url"
@@ -14,13 +13,10 @@ import (
"github.com/go-rod/rod/lib/proto"
"github.com/pkg/errors"
"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/katana/pkg/engine/parser"
"github.com/projectdiscovery/katana/pkg/navigation"
"github.com/projectdiscovery/katana/pkg/utils/queue"
"github.com/projectdiscovery/retryablehttp-go"
)

-func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp.Client, queue *queue.VarietyQueue, parseResponseCallback func(nr navigation.Request), browser *rod.Browser, request navigation.Request, rootHostname string) (*navigation.Response, error) {
+func (c *Crawler) navigateRequest(parseResponseCallback func(nr navigation.Request), browser *rod.Browser, request navigation.Request, rootHostname string, crawlerGraph *navigation.Graph) ([]*navigation.Response, error) {
depth := request.Depth + 1
response := &navigation.Response{
Depth: depth,
@@ -34,6 +30,7 @@ func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp
}
defer page.Close()

+	var asynchronousResponses []*navigation.Response
pageRouter := NewHijack(page)
pageRouter.SetPattern(&proto.FetchRequestPattern{
URLPattern: "*",
@@ -66,18 +63,20 @@ func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp
}

bodyReader, _ := goquery.NewDocumentFromReader(bytes.NewReader(body))
-		resp := navigation.Response{
+		resp := &navigation.Response{
Resp: httpresp,
Body: []byte(body),
Reader: bodyReader,
Options: c.options,
Depth: depth,
RootHostname: rootHostname,
}
-		// process the raw response
-		parser.ParseResponse(resp, parseResponseCallback)
+		asynchronousResponses = append(asynchronousResponses, resp)
+
+		// the raw response is no longer parsed here; the caller parses it
+		// together with the DOM-rendered responses

return FetchContinueRequest(page, e)
})() //nolint
defer func() {
@@ -125,26 +124,26 @@ func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp
response.Resp = &http.Response{Header: make(http.Header), Request: &http.Request{URL: parsed}}

	// Create a copy of interpolated shadow DOM elements and parse them separately
-	responseCopy := *response
-	responseCopy.Body = []byte(builder.String())
-	if !c.options.UniqueFilter.UniqueContent(responseCopy.Body) {
-		return &navigation.Response{}, nil
-	}
-	responseCopy.Reader, _ = goquery.NewDocumentFromReader(bytes.NewReader(responseCopy.Body))
-	if responseCopy.Reader != nil {
-		parser.ParseResponse(responseCopy, parseResponseCallback)
-	}
+	responseShadowDom := *response
+	responseShadowDom.Body = []byte(builder.String())
+	if !c.options.UniqueFilter.UniqueContent(responseShadowDom.Body) {
+		return nil, nil
+	}
+	responseShadowDom.Reader, _ = goquery.NewDocumentFromReader(bytes.NewReader(responseShadowDom.Body))

response.Body = []byte(body)
if !c.options.UniqueFilter.UniqueContent(response.Body) {
-		return &navigation.Response{}, nil
+		return nil, nil
}
response.Reader, err = goquery.NewDocumentFromReader(bytes.NewReader(response.Body))
if err != nil {
return nil, errors.Wrap(err, "could not parse html")
}
-	return response, nil
+
+	responses := []*navigation.Response{response, &responseShadowDom}
+
+	return append(responses, asynchronousResponses...), nil
}

// traverseDOMNode performs traversal of node completely building a pseudo-HTML
48 changes: 40 additions & 8 deletions pkg/engine/hybrid/hybrid.go
@@ -1,17 +1,15 @@
package hybrid

import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"sync/atomic"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/go-rod/rod"
"github.com/go-rod/rod/lib/launcher"
"github.com/go-rod/rod/lib/launcher/flags"
@@ -186,6 +184,15 @@ func (c *Crawler) Crawl(rootURL string) error {
}
}

+	var crawlerGraph *navigation.Graph
+	if c.options.Options.OutputGraph != "" {
+		var err error
+		crawlerGraph, err = navigation.NewGraph()
+		if err != nil {
+			return err
+		}
+	}

wg := sizedwaitgroup.New(c.options.Options.Concurrency)
running := int32(0)
for {
@@ -217,20 +224,45 @@ func (c *Crawler) Crawl(rootURL string) error {
if c.options.Options.Delay > 0 {
time.Sleep(time.Duration(c.options.Options.Delay) * time.Second)
}
-			resp, err := c.navigateRequest(ctx, httpclient, queue, parseResponseCallback, newBrowser, req, hostname)
+
+			// responses contains:
+			// index 0     => primary synchronous node
+			// indexes 1..n => secondary asynchronous nodes
+			responses, err := c.navigateRequest(parseResponseCallback, incognitoBrowser, req, hostname, crawlerGraph)
if err != nil {
gologger.Warning().Msgf("Could not request seed URL: %s\n", err)
return
}
-			if resp == nil || resp.Resp == nil && resp.Reader == nil {
-				return
-			}
-			// process the dom-rendered response
-			parser.ParseResponse(*resp, parseResponseCallback)
+			for idx, resp := range responses {
+				if resp == nil || (resp.Resp == nil && resp.Reader == nil) {
+					continue
+				}
+
+				if crawlerGraph != nil {
+					resp.State, _ = crawlerGraph.AddState(req, *resp, resp.Resp.Request.URL.String())
+					// the web state of response zero becomes the root for the asynchronous requests
+					if idx == 0 {
+						req.State = resp.State
+					}
+				}
+
+				// process the dom-rendered response
+				parser.ParseResponse(*resp, parseResponseCallback)
+			}
}()
}

wg.Wait()

+	if crawlerGraph != nil {
+		// use the domain name as the filename
+		outputFile := filepath.Join(c.options.Options.OutputGraph, hostname)
+		if err := crawlerGraph.ExportTo(outputFile); err != nil {
+			return err
+		}
+	}

return nil
}

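The `navigation.Graph` type used above (`NewGraph`, `AddState`) is not included in this diff. Judging from the call sites and the `github.com/dominikbraun/graph` dependency added to go.mod, it is presumably a thin wrapper around that library. The sketch below is an assumption of how such a wrapper could look — the `State` type and the `State` fields on `Request`/`Response` are inferred, not the PR's actual code:

```go
// pkg/navigation/graph.go — hypothetical sketch inferred from the call sites
package navigation

import (
	"errors"

	"github.com/dominikbraun/graph"
)

// State is one node of the crawl graph; Request and Response are assumed
// to gain a State field pointing at the node they belong to.
type State struct {
	Name string // unique key, e.g. the URL that produced the state
	Data []byte // response body associated with the state
}

// Graph is a directed graph of web states keyed by State.Name.
type Graph struct {
	g graph.Graph[string, *State]
}

// NewGraph creates an empty directed crawl graph.
func NewGraph() (*Graph, error) {
	g := graph.New(func(s *State) string { return s.Name }, graph.Directed())
	return &Graph{g: g}, nil
}

// AddState stores the state reached by resp under the given name and,
// when the originating request already carries a state, links the two.
func (cg *Graph) AddState(req Request, resp Response, name string) (*State, error) {
	state := &State{Name: name, Data: resp.Body}
	if err := cg.g.AddVertex(state); err != nil && !errors.Is(err, graph.ErrVertexAlreadyExists) {
		return nil, err
	}
	if req.State != nil {
		// edge from the originating state to the newly reached one
		_ = cg.g.AddEdge(req.State.Name, state.Name)
	}
	return state, nil
}
```

This would explain why the hybrid engine assigns `req.State = resp.State` for response zero: the asynchronous responses that follow are then attached to the primary node.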
23 changes: 23 additions & 0 deletions pkg/engine/standard/standard.go
@@ -6,6 +6,7 @@ import (
"io"
"net/http"
"net/url"
"path/filepath"
"sync/atomic"
"time"

Expand Down Expand Up @@ -85,6 +86,15 @@ func (c *Crawler) Crawl(rootURL string) error {
return errors.Wrap(err, "could not create http client")
}

+	var crawlerGraph *navigation.Graph
+	if c.options.Options.OutputGraph != "" {
+		var err error
+		crawlerGraph, err = navigation.NewGraph()
+		if err != nil {
+			return err
+		}
+	}

wg := sizedwaitgroup.New(c.options.Options.Concurrency)
running := int32(0)
for {
@@ -124,11 +134,24 @@ func (c *Crawler) Crawl(rootURL string) error {
if resp.Resp == nil || resp.Reader == nil {
return
}

+			if crawlerGraph != nil {
+				resp.State, _ = crawlerGraph.AddState(req, resp, req.URL)
+			}

parser.ParseResponse(resp, parseResponseCallback)
}()
}
wg.Wait()

+	if crawlerGraph != nil {
+		// use the domain name as the filename
+		outputFile := filepath.Join(c.options.Options.OutputGraph, hostname)
+		if err := crawlerGraph.ExportTo(outputFile); err != nil {
+			return err
+		}
+	}

return nil
}

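`ExportTo` is not shown either; both engines call it once per hostname after `wg.Wait()`. Continuing the assumed wrapper above, a plausible implementation serializes each per-host graph to DOT via the library's `draw` package:

```go
// hypothetical continuation of the pkg/navigation.Graph sketch above;
// adds the imports "os" and "github.com/dominikbraun/graph/draw"

// ExportTo writes the crawl graph to <outputFile>.dot. The result can be
// rendered with Graphviz: dot -Tsvg example.com.dot -o example.com.svg
func (cg *Graph) ExportTo(outputFile string) error {
	f, err := os.Create(outputFile + ".dot")
	if err != nil {
		return err
	}
	defer f.Close()
	return draw.DOT(cg.g, f)
}
```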
70 changes: 70 additions & 0 deletions pkg/navigation/content.go
@@ -0,0 +1,70 @@
package navigation

import (
"fmt"
)

// ContentType represents the nature of the page content
type ContentType uint8

// Known content types
const (
Core ContentType = iota
Dynamic
)

// TagType represents the tag type
type TagType uint8

const (
StartTag TagType = iota
EndTag
SelfClosingTag
Doctype
Comment
Text
)

func (t TagType) String() string {
switch t {
case StartTag:
return "ST"
case EndTag:
return "ET"
case Doctype:
return "D"
case Comment:
return "C"
case Text:
return "T"
	case SelfClosingTag:
		return "SCT" // assumed abbreviation for self-closing tags
	default:
return ""
}
}

const (
	// TextHtml is the MIME type for HTML content
	TextHtml string = "text/html"
)

// Attribute is a single name/value pair (with optional namespace) on a node
type Attribute struct {
Name string
Value string
Namespace string
}

// Content is a single parsed node of an HTML page
type Content struct {
TagType TagType
Type ContentType
Data string
Short string
Attributes []Attribute
}

// IDs returns the string identifiers of the content node: one per
// attribute plus one for the tag itself
func (c *Content) IDs() (ids []string) {
for _, attribute := range c.Attributes {
id := fmt.Sprintf("A:%s:%s", attribute.Name, attribute.Value)
ids = append(ids, id)
}
ids = append(ids, fmt.Sprintf("%s:%s", c.TagType.String(), c.Data))
return
}
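`Content.IDs` flattens a node into one identifier per attribute plus one for the tag itself, presumably so nodes can be keyed and compared when graph states are built. A quick usage illustration:

```go
package main

import (
	"fmt"

	"github.com/projectdiscovery/katana/pkg/navigation"
)

func main() {
	// a start tag <a href="/login" id="nav"> represented as content
	c := navigation.Content{
		TagType: navigation.StartTag,
		Type:    navigation.Core,
		Data:    "a",
		Attributes: []navigation.Attribute{
			{Name: "href", Value: "/login"},
			{Name: "id", Value: "nav"},
		},
	}
	fmt.Println(c.IDs()) // [A:href:/login A:id:nav ST:a]
}
```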