-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextraction.js
92 lines (85 loc) · 2.79 KB
/
extraction.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import helper from './helper.js'
import cheerio from 'cheerio'
// Case-insensitive filter applied to the JSON-serialised attributes of each
// <meta> tag: a tag whose attributes contain any of these terms is skipped.
const strings_meta = new RegExp([
  'regionsAllowed',
  'width',
  'height',
  'color',
  'rgba\\(',
  'charset',
  'viewport',
  'refresh',
  'equiv'
].join('|'), 'i')
/**
 * Collect the `<meta>` tags of an HTML document as plain records.
 *
 * Each record carries whichever of `property`, `name` and `itemprop` the tag
 * declares, plus `content` when it is not blank. Tags whose serialised
 * attributes match `strings_meta` are ignored, exact duplicate tags are
 * processed only once, and tags sharing an identifier with an earlier record
 * are folded into that record by joining their contents with ", ".
 *
 * @param {object} site - Not used by this function; kept for a signature
 *   parallel to `extract_patterns`.
 * @param {string} source - HTML markup handed to `cheerio.load`.
 * @returns {Promise<Array<{property?: string, name?: string, itemprop?: string, content?: string}>>}
 *   The collected records; an empty array if anything throws (the error is
 *   logged when `helper.verbose` is truthy).
 */
async function extract_metadata (site, source) {
  const identifying_keys = ['property', 'name', 'itemprop']
  try {
    const $ = cheerio.load(source)
    const metadata_list = []
    // Serialised attribute sets already handled. Comparing the serialised form
    // (rather than object identity) is what lets repeated tags be detected.
    const seen_attribs = new Set()

    // Array.from visits only the indexed elements of the selection, never its
    // bookkeeping properties, and works for any array-like object.
    for (const element of Array.from($('meta'))) {
      const attribs = element && element.attribs
      if (!attribs) continue

      const serialized = JSON.stringify(attribs)
      if (seen_attribs.has(serialized) || strings_meta.test(serialized)) continue
      seen_attribs.add(serialized)

      const record = {}
      for (const key of identifying_keys) {
        if (attribs[key]) record[key] = attribs[key]
      }
      if (attribs.content && attribs.content.trim() !== '') {
        record.content = attribs.content
      }

      // Fold the tag into every earlier record that shares an identifier.
      // Each earlier record is updated at most once, and only with real
      // content, so "undefined" can never leak into the joined string.
      let merged = false
      for (const existing of metadata_list) {
        const same_tag = identifying_keys.some(key => record[key] && existing[key] === record[key])
        if (!same_tag) continue
        merged = true
        if (record.content) {
          existing.content = existing.content
            ? `${existing.content}, ${record.content}`
            : record.content
        }
      }

      if (!merged && Object.keys(record).length !== 0) {
        metadata_list.push(record)
      }
    }
    return metadata_list
  } catch (err) {
    helper.verbose && console.log(err)
    return []
  }
}
/**
 * Percent-decode a captured link, keeping the raw text when it contains a
 * malformed escape sequence (decodeURIComponent throws URIError on those).
 */
function decode_link_safely (value) {
  try {
    return decodeURIComponent(value)
  } catch (err) {
    return value
  }
}

/**
 * Run every pattern listed in `site.extract` against `source` and collect the
 * first capture group of each match.
 *
 * Captures are de-duplicated across all patterns by their raw (undecoded)
 * text. Captures of patterns whose `type` is 'link' are percent-decoded.
 * Matches without a first capture group are skipped.
 *
 * @param {object} site - Site definition; `site.extract`, when present, is a
 *   list of `{ regex, type }` entries where `regex` is a pattern source string.
 * @param {string} source - Text to search.
 * @returns {Promise<Array<{type: *, matched: string}>>} The collected matches
 *   in discovery order; an empty array if anything throws (the error is logged
 *   when `helper.verbose` is truthy).
 */
async function extract_patterns (site, source) {
  try {
    const patterns_list = []
    const seen_captures = new Set()
    if ('extract' in site) {
      site.extract.forEach((item) => {
        const regex_pattern = new RegExp(item.regex, 'g')
        let found = regex_pattern.exec(source)
        while (found !== null) {
          // A zero-length match leaves lastIndex where it was; step past it
          // so the scan always makes progress and terminates.
          if (found[0] === '') {
            regex_pattern.lastIndex++
          }
          const captured = found[1]
          if (captured !== undefined && !seen_captures.has(captured)) {
            seen_captures.add(captured)
            patterns_list.push({
              type: item.type,
              matched: item.type === 'link' ? decode_link_safely(captured) : captured
            })
          }
          found = regex_pattern.exec(source)
        }
      })
    }
    return patterns_list
  } catch (err) {
    helper.verbose && console.log(err)
    return []
  }
}
// Public API of this module, exposed as a single default-exported object.
export default { extract_patterns, extract_metadata }