Go 实现词频统计

a1ec08fa513d2697d52da0805cfbb2fb4216d875.jpeg

功能

  • 统计多个文件中英文单词出现的次数
  • 按照词频从多到少排序输出
  • 支持并发

实现

  • 创建 file.txt 内容如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
You were the shadow to my light,Did you feel us?
Another start,You fade away
Afraid our aim is out of sight,Wanna see us,Alive
Where are you now? Where are you now?Where are you now?
Was it all in my fantasy?Where are you now?Were you only imaginary?
Where are you now? Atlantis Under the sea Under the sea
Where are you now? Another dream,The monster's running wild inside of me
I'm faded I'm faded So lost, I'm faded
I'm faded So lost, I'm faded
These shallow waters never met what I needed I'm letting go a deeper dive
Eternal silence of the sea. I'm breathing alive
Where are you now? Where are you now?
Under the bright but faded lights You've set my heart on fire
Where are you now? Where are you now?
Where are you now? Atlantis
Under the sea Under the sea
Where are you now? Another dream The monster's running wild inside of me
I'm faded I'm faded So lost, I'm faded
I'm faded So lost, I'm faded
  • 创建 wordcount.go 内容如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
package main

import (
"bufio"
"fmt"
"io"
"log"
"os"
"sort"
"strings"
"unicode"
"unicode/utf8"
"path/filepath"
)

type Pair struct {
Key string
Value int
}

// PariList实现了sort接口,可以使用sort.Sort对其排序

type PairList []Pair

func (p PairList) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
func (p PairList) Len() int { return len(p) }
func (p PairList) Less(i, j int) bool { return p[j].Value < p[i].Value } // 逆序

// 提取单词
func SplitOnNonLetters(s string) []string {
notALetter := func(char rune) bool { return !unicode.IsLetter(char) }
return strings.FieldsFunc(s, notALetter)
}

/*
基于map实现了类型WordCount, 并对期实现了Merge(), Report(), SortReport(), UpdateFreq(),
WordFreqCounter() 方法
*/

type WordCount map[string]int

// 用于合并两个WordCount
func (source WordCount) Merge(wordcount WordCount) WordCount {
for k, v := range wordcount {
source[k] += v
}

return source
}

// 打印词频统计情况
func (wordcount WordCount) Report() {
words := make([]string, 0, len(wordcount))
wordWidth, frequencyWidth := 0, 0
for word, frequency := range wordcount {
words = append(words, word)
if width := utf8.RuneCountInString(word); width > wordWidth {
wordWidth = width
}
if width := len(fmt.Sprint(frequency)); width > frequencyWidth {
frequencyWidth = width
}
}
sort.Strings(words)
gap := wordWidth + frequencyWidth - len("Word") - len("Frequency")
fmt.Printf("Word %*s%s\n", gap, " ", "Frequency")
for _, word := range words {
fmt.Printf("%-*s %*d\n", wordWidth, word, frequencyWidth,
wordcount[word])
}
}

// 从多到少打印词频
func (wordcount WordCount) SortReport() {
p := make(PairList, len(wordcount))
i := 0
for k, v := range wordcount { // 将wordcount map转换成PairList
p[i] = Pair{k, v}
i++
}

sort.Sort(p) // 因为PairList实现了排序接口,所以可以使用sort.Sort()对其排序

wordWidth, frequencyWidth := 0, 0
for _, pair := range p {
word, frequency := pair.Key, pair.Value
if width := utf8.RuneCountInString(word); width > wordWidth {
wordWidth = width
}
if width := len(fmt.Sprint(frequency)); width > frequencyWidth {
frequencyWidth = width
}
}
gap := wordWidth + frequencyWidth - len("Word") - len("Frequency")
fmt.Printf("Word %*s%s\n", gap, " ", "Frequency")

for _, pair := range p {
fmt.Printf("%-*s %*d\n", wordWidth, pair.Key, frequencyWidth,
pair.Value)
}

}

// 从文件中读取单词,并更新其出现的次数
func (wordcount WordCount) UpdateFreq(filename string) {
var file *os.File
var err error

if file, err = os.Open(filename); err != nil {
log.Println("failed to open the file: ", err)
return
}
defer file.Close() // 本函数退出之前时,关闭文件

reader := bufio.NewReader(file)
for {
line, err := reader.ReadString('\n')
for _, word := range SplitOnNonLetters(strings.TrimSpace(line)) {
if len(word) > utf8.UTFMax ||
utf8.RuneCountInString(word) > 1 {
wordcount[strings.ToLower(word)] += 1
}
}
if err != nil {
if err != io.EOF {
log.Println("failed to finish reading the file: ", err)
}
break
}
}
}

// 并发统计单词频次
func (wordcount WordCount) WordFreqCounter(files []string) {

results := make(chan Pair, len(files)) // goroutine 将结果发送到该channel
done := make(chan struct{}, len(files)) // 每个goroutine工作完成后,发送一个空结构体到该channel,表示工作完成

for i := 0; i < len(files); { // 有多少个文件就开启多少个goroutine, 使用匿名函数的方式
go func(done chan<- struct{}, results chan<- Pair, filename string) {
wordcount := make(WordCount)
wordcount.UpdateFreq(filename)
for k, v := range wordcount {
pair := Pair{k, v}
results <- pair
}
done <- struct{}{}
}(done, results, files[i])

i++
}

for working := len(files); working > 0; { // 监听通道,直到所有的工作goroutine完成任务时才退出
select {
case pair := <-results: // 接收发送到通道中的统计结果
wordcount[pair.Key] += pair.Value

case <-done: // 判断工作goroutine是否全部完成
working--

}
}

DONE: // 再次启动for循环处理通道中还未处理完的值
for {
select {
case pair := <-results:
wordcount[pair.Key] += pair.Value
default:
break DONE
}
}

close(results)
close(done)

}


func main() {
if len(os.Args) == 1 || os.Args[1] == "-h" || os.Args[1] == "--help" {
fmt.Printf("usage: %s <file1> [<file2> [... <fileN>]]\n",
filepath.Base(os.Args[0]))
os.Exit(1)
}

wordcounter := make(WordCount)

wordcounter.WordFreqCounter(os.Args[1:])

wordcounter.SortReport()
}
  • 编译
1
2
3
$ go build wordcount.go
$ ls
wordcount wordcount.go

执行go build wordcount.go对 wordcount.go 进行编译。执行后,查看当前目录已经有了一个可执行文件 wordcount

  • 执行
1
2
3
4
5
6
7
$ ./wordcount file.txt |head -n 6
Word Frequency
you 17
where 12
are 12
now 12
faded 11

为了方便输出,这里只打印了排名前 5 的单词

-------------本文结束感谢您的阅读-------------
0%