// This script fetches the Telegram channel preview and extracts content from it for our site.
// How content is stored and rendered:
// On disk:
//   content/news/2022-01-30/7/index.md (Telegram ID is used)
// On site:
//   /news/2022-01-30/title-text-slug
'use strict';

const channelPreviewUrl = 'https://t.me/s/OrganicMapsApp';
const newsPath = `${__dirname}/content/news/`;

const https = require('https');
const { pipeline } = require('stream');
const parse = require('node-html-parser').parse;
const fs = require('fs');
const NodeHtmlMarkdown = require('node-html-markdown').NodeHtmlMarkdown;
const nhm = new NodeHtmlMarkdown();

// Extracts the image url from an inline style like "background-image:url('https://...')".
const photoUrlRE = /url\(\'(.*?)\'/;
// Matches an emoji wrapped into bold italic markdown: _**X**_
const emojiRE = /_\*\*(\u00a9|\u00ae|[\u2000-\u3300]\ufe0f?|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]\ufe0f?|\ud83e[\ud000-\udfff]|\uD83C[\uDDE6-\uDDFF]\uD83C[\uDDE6-\uDDFF])\*\*_/gi;
// Media-only posts published within this interval after the previous post are attached to it.
const kPostsDiffInMs = 10 * 60 * 1000; // Ten minutes.
// Extensions checked when looking for an already downloaded photo.
const kPhotoExtensions = ['png', 'webp', 'jpg', 'jpeg'];

/**
 * Downloads url into the file at path.
 * Failures are logged and swallowed, so the returned promise never rejects.
 * @param {string} url - HTTPS url to download.
 * @param {string} path - Destination file path.
 * @returns {Promise<void>} Settles when the download has finished or failed.
 */
function downloadAsync(url, path) {
  return new Promise((resolve, reject) => {
    // https.get() ends the request automatically.
    const request = https.get(url, res => {
      if (res.statusCode !== 200) {
        // The response body must be consumed, otherwise the connection is kept busy.
        res.resume();
        reject(new Error('statusCode=' + res.statusCode));
        return;
      }
      // pipeline() forwards errors from both streams and destroys them on failure.
      pipeline(res, fs.createWriteStream(path), err => {
        if (err) {
          // Best effort: do not leave a truncated file behind.
          fs.unlink(path, () => {});
          reject(err);
          return;
        }
        console.log('Downloaded ' + path);
        resolve();
      });
    });
    request.on('error', reject);
  }).catch(err => console.error('Download failed: ' + err));
}

/**
 * Creates a title for a news piece from its text.
 * The text is cut at the first url, sentence end, colon, line break or exclamation mark.
 * A breaker at the very beginning of the text is ignored.
 * @param {string} text - Plain text of the post.
 * @returns {string} Trimmed title, may be empty.
 */
function getTitle(text) {
  // Strip urls and text after them.
  const breakers = ['http', '. ', ':', '\n', '!'];
  let title = text;
  for (const breaker of breakers) {
    const i = title.indexOf(breaker);
    if (i > 0)
      title = title.substring(0, i);
  }
  return title.trim();
}

const dateRE = /-?(\d\d)?\d\d-\d\d-\d\d-?/g;
const slugRE = /(.\ufe0f)?[^\p{L}\p{N}]+/gu;
const kReplacer = '-';

/**
 * Removes incompatible symbols from the text.
 * @param {string} text - Text to convert, usually a title.
 * @returns {string} Lowercase slug made of letters, digits and '-'. May be an empty string.
 */
function slugify(text) {
  let slug = text.toLowerCase().replace(slugRE, kReplacer);
  // Remove version numbers (dates) from slugs.
  slug = slug.replace(dateRE, kReplacer);
  if (slug.startsWith(kReplacer))
    slug = slug.slice(1);
  if (slug.endsWith(kReplacer))
    slug = slug.slice(0, -1);
  return slug;
}

const trailingSpacesRE = / +\n/sg;

/**
 * Builds the content of index.md (front matter + markdown body) for one post.
 * @param {object|null} node - Parsed text element of the post, or null for media-only posts.
 * @param {string} date - Value of the post's datetime attribute.
 * @returns {string} File content for index.md.
 */
function toMarkdown(node, date) {
  // Assumes an ISO 8601 datetime like 2022-01-30T12:34:56+00:00, so this is HH:MM.
  const time = date.substring(11, 16);
  // News web page title.
  const title = node ? getTitle(node.structuredText) : time;
  const slug = slugify(title) || time.replace(':', '-');
  // The title is written as a double-quoted scalar, where only \ and " need escaping.
  const escapedTitle = title.replaceAll('\\', '\\\\').replaceAll('"', '\\"');
  const frontmatter = `---\ntitle: "${escapedTitle}"\ndate: ${date}\nslug: "${slug}"\n---\n\n`;
  // Photo-only posts do not have text, but should have an index.md file.
  if (!node)
    return frontmatter;

  let content = nhm.translate(node.innerHTML);
  // Clean up some unnecessary whitespaces after nhm conversion.
  content = content.replaceAll(trailingSpacesRE, '\n');
  // We use other dots for lists.
  content = content.replaceAll('• ', '* ');
  // Fix numbered lists like "1\. First".
  content = content.replaceAll(/^(\d+)\\\. /mg, '$1. ');
  // Telegram makes emoji bold italic.
  content = content.replaceAll(emojiRE, '$1');
  return frontmatter + content + '\n'; // Trailing newline for consistency.
}

if (process.argv.length > 2) {
  // Load file from the command line.
  console.log('Loading file ' + process.argv[2]);
  // Read as text: the parser is given a string in the download branch too.
  const html = fs.readFileSync(process.argv[2], 'utf8');
  parseHtml(html);
} else {
  // Download Telegram channel preview html.
  console.log('Downloading ' + channelPreviewUrl);
  const request = https.get(channelPreviewUrl, response => {
    // Thrown from a callback on purpose: an unexpected status aborts the script.
    if (response.statusCode !== 200)
      throw new Error('statusCode=' + response.statusCode);
    const chunks = [];
    response.on('data', fragments => chunks.push(fragments));
    response.on('end', () => {
      const body = Buffer.concat(chunks);
      parseHtml(body.toString());
    });
    response.on('error', err => console.log(err));
  });
  request.on('error', err => console.log(err));
}

/**
 * Parses the channel preview page, creates a directory with index.md for every post
 * and starts downloads of the posts' photos and videos.
 * @param {string} html - Html of the Telegram channel preview page.
 * @returns {Promise<void[]>} Settles when all started downloads have finished or failed.
 */
function parseHtml(html) {
  const root = parse(html);
  // Skip service messages.
  const messages = root.querySelectorAll('.tgme_widget_message:not(.service_message)');
  console.log('Parsing ' + messages.length + ' Telegram posts.');

  const downloads = [];
  let prevDir;
  let prevDate;
  for (const m of messages) {
    const photos = m.querySelectorAll('.tgme_widget_message_photo_wrap');
    const videos = m.querySelectorAll('video');
    const text = m.querySelector('.tgme_widget_message_text');
    // Video files contain time tag with video length. Filter it out.
    const timeTag = m.querySelectorAll('time').find(el => el.hasAttribute('datetime'));
    const post = m.getAttribute('data-post');
    if (!timeTag || !post) {
      // One malformed post should not abort processing of the others.
      console.error('Skipping a post without date or id.');
      continue;
    }
    const date = timeTag.getAttribute('datetime');
    const id = post.split('/').pop();
    const yyyyMMdd = date.substring(0, 10);
    const dir = `${newsPath}${yyyyMMdd}/${id}`;
    if (!fs.existsSync(dir)) {
      console.log('Creating ' + dir);
      fs.mkdirSync(dir, { recursive: true });
      fs.writeFileSync(dir + '/../_index.md', `---\nsort_by: date\ntemplate: news/date_section.html\n---`);
    }

    // Handle special cases when media is published as a separate message immediately after
    // the main text message (Telegram has 1024/2048 chars limit for media post's caption).
    // Such media is stored in the previous post's directory and only if it is not there yet.
    // This message's own directory and index.md are still created.
    const attachToPrev = Boolean(!text && prevDate &&
        (new Date(date) - new Date(prevDate)) <= kPostsDiffInMs);

    // Each message may have 0 or more photos.
    photos.forEach((wrap, index) => {
      const number = index + 1;
      const found = wrap.getAttribute('style')?.match(photoUrlRE);
      if (!found) {
        console.error(`Skipping photo ${number} of post ${id}: image url not found.`);
        return;
      }
      const photoUrl = found[1];
      // Ignore an optional query string or fragment when detecting the extension.
      const ext = photoUrl.split(/[?#]/)[0].split('.').pop();
      const fileName = `${number}.${ext}`;
      if (!attachToPrev) {
        downloads.push(downloadAsync(photoUrl, `${dir}/${fileName}`));
        return;
      }
      // Do not download if images already exist.
      const exists = kPhotoExtensions.some(e => fs.existsSync(`${prevDir}/${number}.${e}`));
      if (!exists)
        downloads.push(downloadAsync(photoUrl, `${prevDir}/${fileName}`));
    });

    // Each message may have 0 or more videos.
    for (const video of videos) {
      const videoUrl = video.getAttribute('src');
      if (!videoUrl) {
        console.error(`Skipping a video of post ${id}: src not found.`);
        continue;
      }
      const fileName = new URL(videoUrl).pathname.split('/').pop();
      if (!attachToPrev) {
        downloads.push(downloadAsync(videoUrl, `${dir}/${fileName}`));
        continue;
      }
      // Do not download if video already exist.
      if (!fs.existsSync(`${prevDir}/${fileName}`))
        downloads.push(downloadAsync(videoUrl, `${prevDir}/${fileName}`));
    }

    const markdown = toMarkdown(text, date);
    fs.writeFile(`${dir}/index.md`, markdown, err => {
      if (err)
        console.error(err);
    });

    prevDir = dir;
    prevDate = date;
  }
  // downloadAsync() never rejects, so this only signals completion.
  return Promise.all(downloads);
}