Skip to content

Commit 67085eb

Browse files
committed
Added skeleton of the main program
1 parent bc285a3 commit 67085eb

7 files changed

Lines changed: 144 additions & 36 deletions

File tree

BUILD.bazel

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@ go_library(
1010
importpath = "github.com/its-my-data/doubak",
1111
visibility = ["//visibility:private"],
1212
deps = [
13-
"//collector",
1413
"//proto",
15-
"@com_github_gocolly_colly_v2//:colly",
14+
"//task",
1615
],
1716
)
1817

collector/BUILD.bazel

Lines changed: 0 additions & 9 deletions
This file was deleted.

collector/collector.go

Lines changed: 0 additions & 13 deletions
This file was deleted.

doubak.go

Lines changed: 76 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
package main
22

33
import (
4+
"errors"
45
"flag"
5-
"fmt"
6-
"github.com/gocolly/colly/v2"
7-
"github.com/its-my-data/doubak/collector"
86
p "github.com/its-my-data/doubak/proto"
7+
"github.com/its-my-data/doubak/task"
8+
"log"
99
"math"
10+
"regexp"
11+
"strings"
1012
"time"
1113
)
1214

@@ -34,17 +36,79 @@ var requestDelay = flag.Duration(p.Flag_req_delay.String(), defaultRequestDelay,
3436
"Min time between any two requests, used to reduce server load. This may "+
3537
"be replaced by a QPS flag when proxy pool and parallel requests are implemented.")
3638

39+
func validateFlags() (tasks []string, categories []string, err error) {
40+
spaceRegex := regexp.MustCompile(`\s`)
41+
42+
// Validate task list (order matters).
43+
strippedTasks := spaceRegex.ReplaceAllString(*tasksToRun, "")
44+
tasks = strings.Split(strippedTasks, ",")
45+
for _, t := range tasks {
46+
if _, ok := p.Task_value[t]; !ok {
47+
err = errors.New("unknown task name: " + t)
48+
return
49+
}
50+
}
51+
52+
// Validate category list (order doesn't matter).
53+
strippedCategories := spaceRegex.ReplaceAllString(*targetCategories, "")
54+
categories = strings.Split(strippedCategories, ",")
55+
for _, c := range categories {
56+
if _, ok := p.Category_value[c]; !ok {
57+
err = errors.New("unknown category name: " + c)
58+
return
59+
}
60+
}
61+
62+
return
63+
}
64+
3765
func main() {
3866
flag.Parse()
3967

40-
collector.Collect()
68+
// Precheck flags that need preprosessing.
69+
log.Print("Validating flags... ")
70+
tasks, categories, parseErr := validateFlags()
71+
if parseErr != nil {
72+
log.Print("FAILED")
73+
log.Fatal(parseErr)
74+
} else {
75+
log.Print("PASS")
76+
}
77+
78+
// Create selected tasks.
79+
taskMap := map[string]task.BaseInterface{}
80+
for _, t := range tasks {
81+
var taskImpl task.BaseInterface
82+
switch t {
83+
case p.Task_collect.String():
84+
taskImpl = task.NewCollector(categories)
85+
86+
// TODO: add other tasks.
87+
// case p.Task_parse:
88+
// case p.Task_publish:
89+
}
90+
taskMap[t] = taskImpl
91+
}
92+
93+
// Run the specific tasks' prechecks first.
94+
for taskName, t := range taskMap {
95+
log.Printf("Prechecking \"%s\"... ", taskName)
96+
if err := t.Precheck(); err != nil {
97+
log.Print("FAILED")
98+
log.Fatal(err)
99+
} else {
100+
log.Print("PASS")
101+
}
102+
}
41103

42-
c := colly.NewCollector()
43-
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
44-
fmt.Println("Found ULR: ", e.Attr("href"))
45-
})
46-
c.OnRequest(func(r *colly.Request) {
47-
fmt.Println("Visiting", r.URL)
48-
})
49-
c.Visit("http://douban.com/")
104+
// Execute the tasks in input order.
105+
for _, taskName := range tasks {
106+
log.Printf("Running task \"%s\"... ", taskName)
107+
if err := taskMap[taskName].Execute(); err != nil {
108+
log.Printf("Task \"%s\" execution failed", taskName)
109+
log.Fatal(err)
110+
} else {
111+
log.Printf("Task \"%s\" passed", taskName)
112+
}
113+
}
50114
}

task/BUILD.bazel

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
load("@io_bazel_rules_go//go:def.bzl", "go_library")
2+
3+
go_library(
4+
name = "task",
5+
srcs = [
6+
"collector.go",
7+
"task.go",
8+
],
9+
importpath = "github.com/its-my-data/doubak/task",
10+
visibility = ["//visibility:public"],
11+
deps = [
12+
"//proto",
13+
"@com_github_gocolly_colly_v2//:colly",
14+
],
15+
)

task/collector.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package task
2+
3+
import (
4+
"flag"
5+
"github.com/gocolly/colly/v2"
6+
p "github.com/its-my-data/doubak/proto"
7+
"log"
8+
)
9+
10+
// Collector carries the state used by the collect task.
type Collector struct {
	// user is populated by NewCollector from a command-line flag value.
	user string

	// categories is the category list handed to NewCollector by its caller.
	categories []string
}
15+
16+
// NewCollector returns a new collector task and initialise it.
17+
func NewCollector(categories []string) *Collector {
18+
return &Collector{
19+
user: flag.Lookup(p.Flag_categories.String()).Value.(flag.Getter).Get().(string),
20+
categories: categories,
21+
}
22+
}
23+
24+
// Precheck validates the flags.
25+
func (task *Collector) Precheck() error {
26+
// TODO: check user existance, etc.
27+
return nil
28+
}
29+
30+
// Execute starts the collection.
31+
func (task *Collector) Execute() error {
32+
// TODO: update the implementation.
33+
c := colly.NewCollector()
34+
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
35+
log.Println("Found ULR: ", e.Attr("href"))
36+
})
37+
c.OnRequest(func(r *colly.Request) {
38+
log.Println("Visiting", r.URL)
39+
})
40+
c.Visit("http://douban.com/")
41+
return nil
42+
}

task/task.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package task
2+
3+
// BaseInterface is the contract every task type implements.
type BaseInterface interface {
	// Precheck verifies flag combinations, validity and similar
	// preconditions. A non-nil error means the check failed.
	Precheck() error

	// Execute runs the task and reports any failure as an error.
	Execute() error
}

0 commit comments

Comments
 (0)