Skip to content

Commit 5accfa6

Browse files
authored
Merge pull request #406 from extractus/8.0.13
v8.0.13
2 parents 39616dc + cc89afc commit 5accfa6

File tree

7 files changed

+261
-1
lines changed

7 files changed

+261
-1
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"version": "8.0.12",
2+
"version": "8.0.13",
33
"name": "@extractus/article-extractor",
44
"description": "To extract main article from given URL",
55
"homepage": "https://github.com/extractus/article-extractor",

src/utils/extractMetaData.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import { DOMParser } from 'linkedom'
44
import extractLdSchema from './extractLdSchema.js'
5+
import findDate from './findDate.js'
56

67
/**
78
* @param {Element} node
@@ -143,5 +144,10 @@ export default (html) => {
143144
})
144145

145146
const entries = extractLdSchema(doc, entry)
147+
148+
if (!entries.published) {
149+
entries.published = findDate(doc)
150+
}
151+
146152
return entries
147153
}

src/utils/extractMetaData.test.js

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ import extractMetaData from './extractMetaData.js'
1010

1111
const keys = 'url shortlink amphtml canonical title description image author source published favicon type'.split(' ')
1212

13+
/**
 * Tell whether a value is a string that `Date` is able to parse.
 *
 * @param {*} date - candidate value
 * @returns {boolean} true only for strings that yield a valid timestamp
 */
function isDateString (date) {
  const isText = typeof date === 'string'
  // An unparseable string produces an "Invalid Date" whose timestamp is NaN.
  return isText && !Number.isNaN(new Date(date).getTime())
}
18+
1319
describe('test extractMetaData', () => {
1420
it('test extractMetaData(good content)', async () => {
1521
const html = readFileSync('./test-data/regular-article.html', 'utf8')
@@ -28,4 +34,24 @@ describe('test extractMetaData', () => {
2834
assert.ok(hasProperty(result, k))
2935
})
3036
})
37+
38+
// Each fixture exposes the publication date differently (a <time> element,
// an itemprop attribute, a plain <span>); none of them should leave
// `published` empty.
it('test extractMetaData(find date)', async () => {
  const html1 = readFileSync('./test-data/regular-article-date-time.html', 'utf8')
  const html2 = readFileSync('./test-data/regular-article-date-itemprop.html', 'utf8')
  const html3 = readFileSync('./test-data/regular-article-date-span.html', 'utf8')
  const result1 = extractMetaData(html1)
  const result2 = extractMetaData(html2)
  const result3 = extractMetaData(html3)
  assert.ok(isObject(result1))
  assert.ok(isObject(result2))
  assert.ok(isObject(result3))
  keys.forEach((k) => {
    assert.ok(hasProperty(result1, k))
    // Check result2 here: the original asserted result3 twice and never
    // verified the itemprop fixture's keys.
    assert.ok(hasProperty(result2, k))
    assert.ok(hasProperty(result3, k))
  })
  assert.ok(isDateString(result1.published))
  assert.ok(isDateString(result2.published))
  assert.ok(isDateString(result3.published))
})
3157
})

src/utils/findDate.js

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
2+
/**
 * Convert a slash-separated date (`MM/DD/YYYY` or `DD/MM/YYYY`) into a
 * `YYYY-MM-DDT00:00:00` string.
 *
 * The first number is taken as the day only when it is greater than 12;
 * otherwise month-first order is assumed. Two-digit years are expanded by
 * prefixing `20`.
 *
 * Input that cannot be converted is returned unchanged: anything that is not
 * three numeric parts, a year that is not 2 or 4 digits, or a month/day
 * outside 1-12 / 1-31.
 *
 * @param {string} dateString
 * @returns {string} `YYYY-MM-DDT00:00:00`, or the input when not convertible
 */
function convertDateFormat (dateString) {
  const parts = dateString.split('/').map((part) => part.trim())
  const isConvertible = parts.length === 3 &&
    /^\d{1,2}$/.test(parts[0]) &&
    /^\d{1,2}$/.test(parts[1]) &&
    /^(?:\d{2}|\d{4})$/.test(parts[2])
  if (!isConvertible) return dateString

  const [first, second, rawYear] = parts

  // A leading number above 12 cannot be a month, so it must be the day.
  const isDayFirst = Number.parseInt(first, 10) > 12
  const day = isDayFirst ? first : second
  const month = isDayFirst ? second : first

  // Reject impossible values instead of emitting an unparseable date string.
  const monthNumber = Number.parseInt(month, 10)
  const dayNumber = Number.parseInt(day, 10)
  if (monthNumber < 1 || monthNumber > 12 || dayNumber < 1 || dayNumber > 31) {
    return dateString
  }

  const year = rawYear.length === 2 ? `20${rawYear}` : rawYear
  return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}T00:00:00`
}
23+
24+
/**
 * Look for the publication date in the body of the content.
 *
 * Elements that commonly carry a date (`time`, anything with a `datetime`
 * attribute, `itemprop` datePublished/dateCreated) are checked first, using
 * their `datetime` attribute, then their `content` attribute, then their
 * text. If none of them yields a date, the text of every `p`, `span` and
 * `div` is scanned in document order.
 *
 * @param {Document} doc - The HTML Document
 * @returns {string|null} The date string, or null when no date is found
 */
export default function findDate (doc) {
  // The digit lookarounds stop a match from being a slice of a longer number,
  // and the year of a slash-separated date must be exactly 2 or 4 digits.
  const datePatterns = [
    /(?<!\d)\d{4}-\d{2}-\d{2}(?!\d)/,
    /(?<!\d)\d{1,2}\/\d{1,2}\/(?:\d{4}|\d{2})(?!\d)/,
  ]

  // Return the first date-like substring of the element's text, or null.
  const findDateInText = (element) => {
    const text = element.textContent || ''
    for (const pattern of datePatterns) {
      const match = text.match(pattern)
      if (match) return convertDateFormat(match[0])
    }
    return null
  }

  const priorityElements = doc.querySelectorAll('time, [datetime], [itemprop~=datePublished], [itemprop~=dateCreated]')
  for (const el of priorityElements) {
    // Attribute values are preferred over visible text and returned as-is.
    const date = el.getAttribute('datetime') || el.getAttribute('content') || findDateInText(el)
    if (date) return date
  }

  const secondaryElements = doc.querySelectorAll('p, span, div')
  for (const el of secondaryElements) {
    const date = findDateInText(el)
    if (date) return date
  }

  return null
}

test-data/regular-article-date-itemprop.html

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
<!doctype html>
2+
<html>
3+
<head>
4+
<meta charset="utf-8">
5+
<meta name="viewport" content="width=device-width, initial-scale=1">
6+
<title>Article title here - ArticleParser</title>
7+
<meta name="author" content="Alice">
8+
<meta name="description" content="Few words about this article">
9+
<meta name="keywords" content="alpha, beta, gamma">
10+
<meta name="twitter:site" content="@ArticleParser">
11+
<meta name="twitter:url" content="https://somewhere.com/path/to/article-title-here">
12+
<meta name="twitter:card" content="summary_large_image">
13+
<meta name="twitter:image" content="https://somewhere.com/path/to/image.jpg">
14+
<meta name="twitter:creator" content="@alice">
15+
<meta property="og:title" content="Article title here">
16+
<meta property="og:type" content="article">
17+
<meta property="og:url" content="https://somewhere.com/path/to/article-title-here">
18+
<meta property="og:description" content="Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs.">
19+
<meta property="og:image" content="https://somewhere.com/path/to/image.jpg">
20+
21+
<link rel="stylesheet" href="/path/to/cssfile.css">
22+
<link rel="canonical" href="https://somewhere.com/another/path/to/article-title-here">
23+
<link rel="amphtml" href="https://m.somewhere.com/another/path/to/article-title-here.amp">
24+
<link rel="shortlink" href="https://sw.re/419283">
25+
<link rel="icon" href="https://somewhere.com/favicon.ico">
26+
27+
<link rel="alternate" title="ArticleParser" type="application/atom+xml" href="https://somewhere.com/atom.xml">
28+
29+
<link rel="manifest" href="/manifest.json">
30+
</head>
31+
<body>
32+
<header>Page header here</header>
33+
<main>
34+
<section>
35+
<nav>Navigation here</nav>
36+
</section>
37+
<section>
38+
<h1>Article title here</h1>
39+
<article>
40+
<div itemprop="datePublished" datetime='2024-10-16T07:33+03:00' class='published'> Published 11/09/2024 07h33min</div>
41+
42+
<div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
43+
<p class="contentdetail">
44+
Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
45+
<p>The first fair dog is, in its own way, a lemon.</p>
46+
<address>4746 Kelly Drive, West Virginia</address>
47+
<img src="./orange.png" style="border: solid 1px #000">
48+
</article>
49+
</section>
50+
<section class="sidebar-widget">
51+
<widget>Some widget here</widget>
52+
<widget>Some widget here</widget>
53+
</section>
54+
</main>
55+
<footer>Page footer here</footer>
56+
</body>
57+
</html>

test-data/regular-article-date-span.html

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
<!doctype html>
2+
<html>
3+
<head>
4+
<meta charset="utf-8">
5+
<meta name="viewport" content="width=device-width, initial-scale=1">
6+
<title>Article title here - ArticleParser</title>
7+
<meta name="author" content="Alice">
8+
<meta name="description" content="Few words about this article">
9+
<meta name="keywords" content="alpha, beta, gamma">
10+
<meta name="twitter:site" content="@ArticleParser">
11+
<meta name="twitter:url" content="https://somewhere.com/path/to/article-title-here">
12+
<meta name="twitter:card" content="summary_large_image">
13+
<meta name="twitter:image" content="https://somewhere.com/path/to/image.jpg">
14+
<meta name="twitter:creator" content="@alice">
15+
<meta property="og:title" content="Article title here">
16+
<meta property="og:type" content="article">
17+
<meta property="og:url" content="https://somewhere.com/path/to/article-title-here">
18+
<meta property="og:description" content="Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs.">
19+
<meta property="og:image" content="https://somewhere.com/path/to/image.jpg">
20+
21+
<link rel="stylesheet" href="/path/to/cssfile.css">
22+
<link rel="canonical" href="https://somewhere.com/another/path/to/article-title-here">
23+
<link rel="amphtml" href="https://m.somewhere.com/another/path/to/article-title-here.amp">
24+
<link rel="shortlink" href="https://sw.re/419283">
25+
<link rel="icon" href="https://somewhere.com/favicon.ico">
26+
27+
<link rel="alternate" title="ArticleParser" type="application/atom+xml" href="https://somewhere.com/atom.xml">
28+
29+
<link rel="manifest" href="/manifest.json">
30+
</head>
31+
<body>
32+
<header>Page header here</header>
33+
<main>
34+
<section>
35+
<nav>Navigation here</nav>
36+
</section>
37+
<section>
38+
<h1>Article title here</h1>
39+
<article>
40+
<span class='published'> Published at 11/09/2024 07h33 am</span>
41+
42+
<div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
43+
<p class="contentdetail">
44+
Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
45+
<p>The first fair dog is, in its own way, a lemon.</p>
46+
<address>4746 Kelly Drive, West Virginia</address>
47+
<img src="./orange.png" style="border: solid 1px #000">
48+
</article>
49+
</section>
50+
<section class="sidebar-widget">
51+
<widget>Some widget here</widget>
52+
<widget>Some widget here</widget>
53+
</section>
54+
</main>
55+
<footer>Page footer here</footer>
56+
</body>
57+
</html>

test-data/regular-article-date-time.html

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
<!doctype html>
2+
<html>
3+
<head>
4+
<meta charset="utf-8">
5+
<meta name="viewport" content="width=device-width, initial-scale=1">
6+
<title>Article title here - ArticleParser</title>
7+
<meta name="author" content="Alice">
8+
<meta name="description" content="Few words about this article">
9+
<meta name="keywords" content="alpha, beta, gamma">
10+
<meta name="twitter:site" content="@ArticleParser">
11+
<meta name="twitter:url" content="https://somewhere.com/path/to/article-title-here">
12+
<meta name="twitter:card" content="summary_large_image">
13+
<meta name="twitter:image" content="https://somewhere.com/path/to/image.jpg">
14+
<meta name="twitter:creator" content="@alice">
15+
<meta property="og:title" content="Article title here">
16+
<meta property="og:type" content="article">
17+
<meta property="og:url" content="https://somewhere.com/path/to/article-title-here">
18+
<meta property="og:description" content="Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs.">
19+
<meta property="og:image" content="https://somewhere.com/path/to/image.jpg">
20+
21+
<link rel="stylesheet" href="/path/to/cssfile.css">
22+
<link rel="canonical" href="https://somewhere.com/another/path/to/article-title-here">
23+
<link rel="amphtml" href="https://m.somewhere.com/another/path/to/article-title-here.amp">
24+
<link rel="shortlink" href="https://sw.re/419283">
25+
<link rel="icon" href="https://somewhere.com/favicon.ico">
26+
27+
<link rel="alternate" title="ArticleParser" type="application/atom+xml" href="https://somewhere.com/atom.xml">
28+
29+
<link rel="manifest" href="/manifest.json">
30+
</head>
31+
<body>
32+
<header>Page header here</header>
33+
<main>
34+
<section>
35+
<nav>Navigation here</nav>
36+
</section>
37+
<section>
38+
<h1>Article title here</h1>
39+
<article>
40+
<time datetime='2024-10-16T07:33+03:00' class='published'> Published 11/09/2024 07h33min</time>
41+
42+
<div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
43+
<p class="contentdetail">
44+
Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
45+
<p>The first fair dog is, in its own way, a lemon.</p>
46+
<address>4746 Kelly Drive, West Virginia</address>
47+
<img src="./orange.png" style="border: solid 1px #000">
48+
</article>
49+
</section>
50+
<section class="sidebar-widget">
51+
<widget>Some widget here</widget>
52+
<widget>Some widget here</widget>
53+
</section>
54+
</main>
55+
<footer>Page footer here</footer>
56+
</body>
57+
</html>

0 commit comments

Comments
 (0)