|
1 | 1 | package main |
2 | 2 |
|
3 | 3 | import ( |
| 4 | + "errors" |
4 | 5 | "flag" |
5 | | - "fmt" |
6 | | - "github.com/gocolly/colly/v2" |
7 | | - "github.com/its-my-data/doubak/collector" |
8 | 6 | p "github.com/its-my-data/doubak/proto" |
| 7 | + "github.com/its-my-data/doubak/task" |
| 8 | + "log" |
9 | 9 | "math" |
| 10 | + "regexp" |
| 11 | + "strings" |
10 | 12 | "time" |
11 | 13 | ) |
12 | 14 |
|
@@ -34,17 +36,79 @@ var requestDelay = flag.Duration(p.Flag_req_delay.String(), defaultRequestDelay, |
34 | 36 | "Min time between any two requests, used to reduce server load. This may "+ |
35 | 37 | "be replaced by a QPS flag when proxy pool and parallel requests are implemented.") |
36 | 38 |
|
| 39 | +func validateFlags() (tasks []string, categories []string, err error) { |
| 40 | + spaceRegex := regexp.MustCompile(`\s`) |
| 41 | + |
| 42 | + // Validate task list (order matters). |
| 43 | + strippedTasks := spaceRegex.ReplaceAllString(*tasksToRun, "") |
| 44 | + tasks = strings.Split(strippedTasks, ",") |
| 45 | + for _, t := range tasks { |
| 46 | + if _, ok := p.Task_value[t]; !ok { |
| 47 | + err = errors.New("unknown task name: " + t) |
| 48 | + return |
| 49 | + } |
| 50 | + } |
| 51 | + |
| 52 | + // Validate category list (order doesn't matter). |
| 53 | + strippedCategories := spaceRegex.ReplaceAllString(*targetCategories, "") |
| 54 | + categories = strings.Split(strippedCategories, ",") |
| 55 | + for _, c := range categories { |
| 56 | + if _, ok := p.Category_value[c]; !ok { |
| 57 | + err = errors.New("unknown category name: " + c) |
| 58 | + return |
| 59 | + } |
| 60 | + } |
| 61 | + |
| 62 | + return |
| 63 | +} |
| 64 | + |
37 | 65 | func main() { |
38 | 66 | flag.Parse() |
39 | 67 |
|
40 | | - collector.Collect() |
| 68 | + // Precheck flags that need preprosessing. |
| 69 | + log.Print("Validating flags... ") |
| 70 | + tasks, categories, parseErr := validateFlags() |
| 71 | + if parseErr != nil { |
| 72 | + log.Print("FAILED") |
| 73 | + log.Fatal(parseErr) |
| 74 | + } else { |
| 75 | + log.Print("PASS") |
| 76 | + } |
| 77 | + |
| 78 | + // Create selected tasks. |
| 79 | + taskMap := map[string]task.BaseInterface{} |
| 80 | + for _, t := range tasks { |
| 81 | + var taskImpl task.BaseInterface |
| 82 | + switch t { |
| 83 | + case p.Task_collect.String(): |
| 84 | + taskImpl = task.NewCollector(categories) |
| 85 | + |
| 86 | + // TODO: add other tasks. |
| 87 | + // case p.Task_parse: |
| 88 | + // case p.Task_publish: |
| 89 | + } |
| 90 | + taskMap[t] = taskImpl |
| 91 | + } |
| 92 | + |
| 93 | + // Run the specific tasks' prechecks first. |
| 94 | + for taskName, t := range taskMap { |
| 95 | + log.Printf("Prechecking \"%s\"... ", taskName) |
| 96 | + if err := t.Precheck(); err != nil { |
| 97 | + log.Print("FAILED") |
| 98 | + log.Fatal(err) |
| 99 | + } else { |
| 100 | + log.Print("PASS") |
| 101 | + } |
| 102 | + } |
41 | 103 |
|
42 | | - c := colly.NewCollector() |
43 | | - c.OnHTML("a[href]", func(e *colly.HTMLElement) { |
44 | | - fmt.Println("Found ULR: ", e.Attr("href")) |
45 | | - }) |
46 | | - c.OnRequest(func(r *colly.Request) { |
47 | | - fmt.Println("Visiting", r.URL) |
48 | | - }) |
49 | | - c.Visit("http://douban.com/") |
| 104 | + // Execute the tasks in input order. |
| 105 | + for _, taskName := range tasks { |
| 106 | + log.Printf("Running task \"%s\"... ", taskName) |
| 107 | + if err := taskMap[taskName].Execute(); err != nil { |
| 108 | + log.Printf("Task \"%s\" execution failed", taskName) |
| 109 | + log.Fatal(err) |
| 110 | + } else { |
| 111 | + log.Printf("Task \"%s\" passed", taskName) |
| 112 | + } |
| 113 | + } |
50 | 114 | } |
0 commit comments