website/update_news.js

// This script fetches Telegram channel preview and extracts content from it for our site.
// How content is stored and rendered:
// On disk:
// content/news/2022-01-30/7/index.md  (Telegram ID is used)
// On site:
// /news/2022-01-30/title-text-slug

'use strict';
// Address of the Telegram channel preview page that is fetched when no file is given on the command line.
const channelPreviewUrl = 'https://t.me/s/OrganicMapsApp';
// Directory (next to this script) under which per-date news folders are created.
const newsPath = `${__dirname}/content/news/`;

const https = require('https');
const parse = require('node-html-parser').parse;
const fs = require('fs');
const NodeHtmlMarkdown = require('node-html-markdown').NodeHtmlMarkdown;
const nhm = new NodeHtmlMarkdown();
// Captures the address inside a CSS url('...') value; applied to the style attribute of photo wrappers.
const photoUrlRE = /url\(\'(.*?)\'/;
// Matches one symbol (©, ®, a character in U+2000..U+3300, or a surrogate-pair emoji) wrapped
// into _**...**_ markdown emphasis. The symbol itself is captured in group 1.
const emojiRE = /_\*\*(\u00a9|\u00ae|[\u2000-\u3300]\ufe0f?|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]\ufe0f?|\ud83e[\ud000-\udfff]|\uD83C[\uDDE6-\uDDFF]\uD83C[\uDDE6-\uDDFF])\*\*_/gi;

// Maximum time gap at which a text-less media post is still attached to the previous post.
const kPostsDiffInMs = 10 * 60 * 1000;  // Ten minutes.

/**
 * Downloads `url` over HTTPS and stores the response body in the file `path`.
 *
 * Failures (network errors, non-200 status codes, file system errors) are logged
 * with console.error and do not reject the returned promise.
 *
 * @param {string} url - Address of the file to download.
 * @param {string} path - Destination file path; its directory should already exist.
 * @returns {Promise<void>} Settles when the download has finished or failed.
 */
function downloadAsync(url, path) {
  return new Promise((resolve, reject) => {
    const request = https.get(url, res => {
      if (res.statusCode !== 200) {
        // Discard the unwanted body so that the connection is released.
        res.resume();
        return reject(new Error('statusCode=' + res.statusCode));
      }
      const file = fs.createWriteStream(path);
      // Without a listener a write error would be thrown as an uncaught exception
      // and the promise would never settle.
      file.on('error', reject);
      // An interrupted response must not leave the promise pending or the file open.
      res.on('error', err => {
        file.destroy();
        reject(err);
      });
      file.on('finish', () => {
        file.close();
        console.log('Downloaded ' + path);
        resolve();
      });
      res.pipe(file);
    });
    request.on('error', reject);
    request.end();
  }).catch(err => console.error('Download failed: ' + err));
}

// Builds a news title from the post text: the text is cut at the first occurrence of each
// separator in turn (a separator at the very beginning is ignored), then trimmed.
function getTitle(text) {
  const separators = ['http', '. ', ':', '\n', '!'];
  const shortened = separators.reduce((current, separator) => {
    const cutAt = current.indexOf(separator);
    return cutAt > 0 ? current.substring(0, cutAt) : current;
  }, text);
  return shortened.trim();
}

// Date-like version numbers (yy-mm-dd or yyyy-mm-dd) together with adjacent dashes.
const dateRE = /-?(\d\d)?\d\d-\d\d-\d\d-?/g;
// Runs of characters that are neither letters nor digits, optionally preceded by
// a character with a variation selector.
const slugRE = /(.\ufe0f)?[^\p{L}\p{N}]+/gu;
const kReplacer = '-';
// Converts the text into a lowercase, dash-separated slug without dates.
// The result may be an empty string.
function slugify(text) {
  let slug = text
    .toLowerCase()
    .replace(slugRE, kReplacer)
    // Version numbers (dates) are not wanted in slugs.
    .replace(dateRE, kReplacer);
  // Drop at most one dash from each end.
  if (slug.startsWith(kReplacer)) slug = slug.slice(1);
  if (slug.endsWith(kReplacer)) slug = slug.slice(0, -1);
  return slug;
}

const trailingSpacesRE = / +\n/sg;

/**
 * Builds the content of a news page (front matter plus markdown body) for one post.
 *
 * @param {object|null|undefined} node - Parsed element with the post text, or a falsy value
 *     for posts without text.
 * @param {string} date - Post date string; characters 11..15 are used as the hh:mm time.
 * @returns {string} Front matter only for text-less posts, otherwise front matter followed by
 *     the converted markdown and a trailing newline.
 */
function toMarkdown(node, date) {
  const time = date.substring(11, 16);
  // News web page title.
  const title = node ? getTitle(node.structuredText) : time;
  // Fall back to the time when nothing usable is left from the title.
  const slug = slugify(title) || time.replace(':', '-');

  // The title is written as a double-quoted scalar, where only the backslash and
  // the double quote have to be escaped. Apostrophes must stay as they are.
  const escapedTitle = title.replaceAll('\\', '\\\\').replaceAll('"', '\\"');
  const frontmatter = `---\ntitle: "${escapedTitle}"\ndate: ${date}\nslug: "${slug}"\n---\n\n`;
  // Photo-only posts do not have text, but should have an index.md file.
  if (!node) return frontmatter;

  let content = nhm.translate(node.innerHTML);
  // Clean up some unnecessary whitespaces after nhm conversion.
  content = content.replaceAll(trailingSpacesRE, '\n');
  // We use other dots for lists.
  content = content.replaceAll('• ', '* ');
  // Fix numbered lists like "1\. First".
  content = content.replaceAll(/^(\d+)\\\. /mg, '$1. ');
  // Telegram makes emoji bold italic.
  content = content.replaceAll(emojiRE, '$1');
  return frontmatter + content + '\n'; // Trailing newline for consistency.
}

// Entry point: parse either a saved preview page (path in the first command line argument)
// or the live channel preview downloaded from Telegram.
if (process.argv.length > 2) {
  // Load file from the command line.
  const htmlPath = process.argv[2];
  console.log('Loading file ' + htmlPath);
  // Read as UTF-8 text so that parseHtml gets a string, exactly like in the download branch below.
  const html = fs.readFileSync(htmlPath, 'utf8');
  parseHtml(html);
} else {
  // Download Telegram channel preview html.
  console.log('Downloading ' + channelPreviewUrl);
  const request = https.get(channelPreviewUrl, response => {
    if (response.statusCode !== 200) throw new Error('statusCode=' + response.statusCode);
    const chunks = [];
    response.on('data', fragment => chunks.push(fragment));
    response.on('end', () => {
      // Decode only after all chunks are joined, so multi-byte characters split
      // between chunks are not broken.
      const body = Buffer.concat(chunks);
      parseHtml(body.toString());
    });
    response.on('error', err => console.log(err));
  });
  request.on('error', err => console.log(err));
  request.end();
}

/**
 * Parses the Telegram channel preview page and stores its posts on disk.
 *
 * For every non-service message a `<newsPath>/<yyyy-mm-dd>/<id>` directory is created
 * (if missing), photos and videos are downloaded, and index.md is written.
 * Media of a text-less message that follows the previous message within kPostsDiffInMs
 * is stored in the directory of the previous message instead.
 *
 * @param {string} html - Markup of the channel preview page.
 * @returns {Promise<Array>} Settles when all downloads started by this call have finished.
 */
function parseHtml(html) {
  const root = parse(html);

  // Skip service messages.
  const messages = root.querySelectorAll('.tgme_widget_message:not(.service_message)');
  console.log('Parsing ' + messages.length + ' Telegram posts.');
  const downloads = [];
  let prevDir;
  let prevDate;
  messages.forEach(m => {
    const photos = m.querySelectorAll('.tgme_widget_message_photo_wrap');
    const videos = m.querySelectorAll('video');
    const text = m.querySelector('.tgme_widget_message_text');
    // Video files contain time tag with video length. Filter it out.
    const date = m.querySelectorAll('time').filter(el => el.hasAttribute('datetime'))[0].getAttribute('datetime');
    const id = m.getAttribute('data-post').split('/').pop();

    const yyyyMMdd = date.substring(0, 10);
    const dir = `${newsPath}${yyyyMMdd}/${id}`;
    if (!fs.existsSync(dir)) {
      console.log('Creating ' + dir);
      fs.mkdirSync(dir, { recursive: true });
      fs.writeFileSync(dir + '/../_index.md', `---\nsort_by: date\ntemplate: news/date_section.html\n---`);
    }

    // Handle special cases when media is published as a separate message immediately after
    // the main text message (Telegram has 1024/2048 chars limit for media post's caption).
    // Such media is stored together with the previous message.
    const isContinuation = Boolean(
        !text && prevDate && (new Date(date) - new Date(prevDate)) <= kPostsDiffInMs);

    // Each message may have 0 or more photos.
    for (let i = 1; i <= photos.length; ++i) {
      const urlMatch = photoUrlRE.exec(photos[i - 1].getAttribute('style') ?? '');
      if (!urlMatch) {
        // A wrapper without a background image url should not abort the whole run.
        console.warn(`Skipping photo ${i} of post ${id}: no image url found.`);
        continue;
      }
      const photo = urlMatch[1];
      const ext = photo.split('.').pop();
      const fileName = `${i}.${ext}`;
      if (!isContinuation) {
        downloads.push(downloadAsync(photo, `${dir}/${fileName}`));
        continue;
      }
      // Do not download if images already exist.
      // NOTE: the directory created above for this media-only message is kept on disk.
      const alreadyStored = ['png', 'webp', 'jpg', 'jpeg']
          .some(knownExt => fs.existsSync(`${prevDir}/${i}.${knownExt}`));
      if (!alreadyStored) {
        downloads.push(downloadAsync(photo, `${prevDir}/${fileName}`));
      }
    }

    // Each message may have 0 or more videos.
    for (const video of videos) {
      const videoUrl = video.getAttribute('src');
      const fileName = new URL(videoUrl).pathname.split('/').pop();
      if (!isContinuation) {
        downloads.push(downloadAsync(videoUrl, `${dir}/${fileName}`));
        continue;
      }
      // Do not download if video already exist.
      if (!fs.existsSync(`${prevDir}/${fileName}`)) {
        downloads.push(downloadAsync(videoUrl, `${prevDir}/${fileName}`));
      }
    }

    const markdown = toMarkdown(text, date);
    fs.writeFile(`${dir}/index.md`, markdown, err => {
      if (err) console.error(err);
    });

    prevDir = dir;
    prevDate = date;
  });
  // Expose the pending downloads so that a caller may wait for them.
  return Promise.all(downloads);
}