diff --git a/.formatter.exs b/.formatter.exs
index d2cda26..2acd350 100644
--- a/.formatter.exs
+++ b/.formatter.exs
@@ -1,4 +1,7 @@
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
+# SPDX-License-Identifier: LGPL-2.1-only
+
# Used by "mix format"
[
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
]
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ee1665a..efe5acd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,38 +1,41 @@
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
+# SPDX-License-Identifier: LGPL-2.1-only
+
image: elixir:1.12-alpine
variables:
MIX_ENV: test
GIT_SUBMODULE_STRATEGY: recursive
cache:
key: ${CI_COMMIT_REF_SLUG}
paths:
- deps
- _build
stages:
- test
- publish
before_script:
- apk add build-base cmake
- mix local.hex --force
- mix local.rebar --force
- mix deps.get --only test
- mix compile --force
lint:
stage: test
script:
- mix format --check-formatted
unit-testing:
stage: test
coverage: '/(\d+\.\d+\%) \| Total/'
script:
- mix test --trace --preload-modules --cover
dialyzer:
stage: test
script:
- mix dialyzer
diff --git a/.gitmodules b/.gitmodules
index c842c8c..2dea083 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,8 @@
# SPDX-FileCopyrightText: 2017-2019 myhtmlex authors <https://github.com/Overbryd/myhtmlex>
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
# SPDX-License-Identifier: LGPL-2.1-only
[submodule "c_src/lexbor"]
path = c_src/lexbor
url = https://github.com/lanodan/lexbor.git
branch = bugfix/gcc-10
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e634b71..8a651eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,58 +1,63 @@
+<!--
+SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
+SPDX-License-Identifier: LGPL-2.1-only
+-->
+
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [2.0.4] - 2020-09-01
### Fixed
- Regression of the gcc 10 fix in the hex package, due to an outdated submodule on the publishing machine
## [2.0.3] - 2020-08-30
### Changed
- Improved error message on make errors
## [2.0.2] - 2020-08-26
### Fixed
- lexbor compilation errors with gcc 10
- Inability to use the library with distillery releases due to priv dir being in the source code
## [2.0.1] - 2020-08-04
### Fixed
- Build failures using the hex package due to CMake cache files accidentally included there
### Added
- Changelog is now available at hexdocs
## [2.0.0] - 2020-08-01
### Changed
- **Breaking:** CMake is now required at compile-time due to it being lexbor's build system
- **Breaking:** namespaces are no longer automatically appended, i.e. `<svg> </svg>` will be `{"svg", [], []}` instead of `{"svg:svg", [], []}`
- **Breaking:** when using `:nil_self_closing` flag, only valid [void elements](https://html.spec.whatwg.org/#void-elements) will have `nil` in children
- Now deprecated myhtml was switched to [lexbor](https://github.com/lexbor/lexbor)
- The worker process now communicates with the node via stdio, instead of TCP, which was known to cause issues
on BSD systems
### Added
- `FastHtml.Pool` for fast_html workers. There is a default pool of `System.schedulers_online/0` workers, but a custom pool can be started if desired, or it can be disabled altogether. See `FastHtml.Pool` module documentation for more info
## [1.0.3] - 2020-02-10
### Fixed
- C-Node not respawning after being killed.
## [1.0.2] - 2020-02-10
### Fixed
- Incorrect behavior when parsing empty attribute values. Instead of an empty string the attribute name was returned.
## [1.0.1] - 2019-12-11
### Added
- `:fast_html.decode_fragment`
### Fixed
- Errors from C-Node not being reported, timing out instead
## [1.0.0] - 2019-12-02
### Changed
- **BREAKING:** `:fast_html.decode` now returns a list of nodes at the top level, instead of a single node. This was done because it's possible to have more than one root node, for example in `<!-- a comment --> <html> </html>` both the comment and the `html` tag are root nodes.
### Fixed
- Worker going into infinite loop when decoding a document with more than one root node.
diff --git a/Makefile b/Makefile
index 87c0f41..5851262 100644
--- a/Makefile
+++ b/Makefile
@@ -1,59 +1,60 @@
# SPDX-FileCopyrightText: 2017-2019 myhtmlex authors <https://github.com/Overbryd/myhtmlex>
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
# SPDX-License-Identifier: LGPL-2.1-only
MIX = mix
CMAKE = cmake
CNODE_CFLAGS = -g -O2 -std=c99 -pedantic -Wcomment -Wextra -Wno-old-style-declaration -Wall
# ignore unused parameter warnings
CNODE_CFLAGS += -Wno-unused-parameter
# set erlang include path
ERLANG_PATH = $(shell erl -eval 'io:format("~s", [lists:concat([code:root_dir(), "/erts-", erlang:system_info(version)])])' -s init stop -noshell)
CNODE_CFLAGS += -I$(ERLANG_PATH)/include
# expecting myhtml as a submodule in c_src/
# that way we can pin a version and package the whole thing in hex
# hex does not allow for non-app related dependencies.
LXB_PATH = c_src/lexbor
LXB_STATIC = $(LXB_PATH)/liblexbor_static.a
CNODE_CFLAGS += -I$(LXB_PATH)/source
# avoid undefined reference errors to pthread_mutex_trylock
CNODE_CFLAGS += -lpthread
# C-Node
ERL_INTERFACE = $(wildcard $(ERLANG_PATH)/../lib/erl_interface-*)
CNODE_CFLAGS += -L$(ERL_INTERFACE)/lib
CNODE_CFLAGS += -I$(ERL_INTERFACE)/include
CNODE_LDFLAGS =
ifeq ($(OTP22_DEF),YES)
CNODE_CFLAGS += -DOTP_22_OR_NEWER
else
CNODE_LDFLAGS += -lerl_interface
endif
CNODE_LDFLAGS += -lei -pthread
.PHONY: all
all: priv/fasthtml_worker
$(LXB_STATIC): $(LXB_PATH)
# Sadly, building the components separately seems to sporadically fail
cd $(LXB_PATH); cmake -DLEXBOR_BUILD_SEPARATELY=OFF -DLEXBOR_BUILD_SHARED=OFF
$(MAKE) -C $(LXB_PATH)
priv/fasthtml_worker: c_src/fasthtml_worker.c $(LXB_STATIC)
mkdir -p priv
$(CC) -o $@ $< $(LXB_STATIC) $(CNODE_CFLAGS) $(CNODE_LDFLAGS)
clean: clean-myhtml
$(RM) -r priv/myhtmlex*
$(RM) priv/fasthtml_worker
$(RM) myhtmlex-*.tar
$(RM) -r package-test
clean-myhtml:
$(MAKE) -C $(MYHTML_PATH) clean
diff --git a/README.md b/README.md
index c5d5f9d..d01eb3b 100644
--- a/README.md
+++ b/README.md
@@ -1,34 +1,35 @@
<!--
SPDX-FileCopyrightText: 2017-2019 myhtmlex authors <https://github.com/Overbryd/myhtmlex>
+SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
SPDX-License-Identifier: LGPL-2.1-only
-->
# FastHTML
A C Node wrapping [lexbor](https://github.com/lexbor/lexbor) (formerly lexborisov's [myhtml](https://github.com/lexborisov/myhtml)).
Primarily used with [FastSanitize](https://git.pleroma.social/pleroma/fast_sanitize).
* Available as a hex package: `{:fast_html, "~> 2.0"}`
* [Documentation](https://hexdocs.pm/fast_html/fast_html.html)
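A minimal usage sketch, mirroring the doctests in `lib/fast_html.ex`:

```elixir
iex> :fast_html.decode("<h1>Hello world</h1>")
{:ok, [{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}]}
```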
## Benchmarks
The following table provides the median time it takes to decode a string into a tree for HTML parsers that can be used from Elixir. Benchmarks were conducted on a machine with an `AMD Ryzen 9 3950X (32) @ 3.500GHz` CPU and 32GB of RAM. The `mix fast_html.bench` task can be used to run the benchmark yourself.
| File/Parser | fast_html (Port) | mochiweb_html (erlang) | html5ever (Rust NIF) | Myhtmlex (NIF)¹ |
|----------------------|--------------------|------------------------|----------------------|----------------|
| document-large.html (6.9M) | 125.12 ms | 1778.34 ms | 395.21 ms | 327.17 ms |
| document-medium.html (85K) | 1.93 ms | 12.10 ms | 4.74 ms | 3.82 ms |
| document-small.html (25K)| 0.50 ms | 2.76 ms | 1.72 ms | 1.19 ms |
| fragment-large.html (33K)| 0.93 ms | 4.78 ms | 2.34 ms | 2.15 ms |
| fragment-small.html² (757B)| 44.60 μs | 42.13 μs | 43.58 μs | 289.71 μs |
Full benchmark output can be seen in [this snippet](https://git.pleroma.social/pleroma/elixir-libraries/fast_html/snippets/3128)
1. Myhtmlex has a C-Node mode, but it wasn't benchmarked here because it segfaults on `document-large.html`
2. The slowdown on `fragment-small.html` is due to Port overhead. Unlike html5ever and Myhtmlex in NIF mode, `fast_html` has the parser process isolated and communicates with it over stdio, so even if a fatal crash in the parser happens, it won't bring down the entire VM.
## Contribution / Bug Reports
* Please make sure you do `git submodule update` after a checkout/pull
* The project aims to be fully tested
diff --git a/c_src/fasthtml_worker.c b/c_src/fasthtml_worker.c
index 566672e..0030b2b 100644
--- a/c_src/fasthtml_worker.c
+++ b/c_src/fasthtml_worker.c
@@ -1,534 +1,537 @@
+// SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
+// SPDX-License-Identifier: LGPL-2.1-only
+
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <ctype.h>
#ifndef _REENTRANT
#define _REENTRANT /* For some reason __erl_errno is undefined unless _REENTRANT is defined */
#endif
#include "ei.h"
#ifndef OTP_22_OR_NEWER
# include "erl_interface.h"
#endif
#define HEADER_SIZE 4
#include <lexbor/html/html.h>
#include "tstack.h"
#ifdef __GNUC__
# define AFP(x, y) __attribute__((format (printf, x, y)))
#else
# define AFP(x, y)
#endif
#ifdef __GNUC__
# define NORETURN __attribute__((noreturn))
#else
# define NORETURN
#endif
typedef struct _state_t {
ei_x_buff buffer;
} state_t;
typedef enum parse_flags_e {
FLAG_HTML_ATOMS = 1 << 0,
FLAG_NIL_SELF_CLOSING = 1 << 1,
FLAG_COMMENT_TUPLE3 = 1 << 2
} parse_flags_t;
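// For example, calling :fast_html.decode/2 with format: [:html_atoms, :comment_tuple3]
// on the Elixir side yields parse_flags == (FLAG_HTML_ATOMS | FLAG_COMMENT_TUPLE3) here.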
char* read_packet(int *len);
static void handle_send(state_t * state);
static void err_term(ei_x_buff * response, const char * error_atom);
static parse_flags_t decode_parse_flags(state_t * state, int arity);
static void decode(state_t * state, ei_x_buff * response, lxb_html_document_t *document, bool fragment, lxb_dom_element_t *context_element, lxb_char_t * bin_data, size_t bin_size, parse_flags_t parse_flags);
static void build_tree(ei_x_buff * response, lxb_dom_node_t* tree, parse_flags_t parse_flags);
static void prepare_node_attrs(ei_x_buff * response, lxb_dom_node_t* node);
static inline char * lowercase(char * c);
static void panic(const char *fmt, ...) AFP(1, 2);
static void panic(const char *fmt, ...) {
char buf[4096];
va_list va;
va_start (va, fmt);
vsnprintf (buf, sizeof buf, fmt, va);
va_end (va);
fprintf (stderr, "fast_html worker: error: %s\n", buf);
exit (EXIT_FAILURE);
}
int main(int argc, const char *argv[]) {
state_t* state = calloc (1, sizeof(state_t));
#ifdef OTP_22_OR_NEWER
// initialize erlang client library
ei_init ();
#else
erl_init (NULL, -1);
#endif
ei_x_new (&state->buffer);
fflush (stdout);
while (true) {
int len;
char* buf = read_packet(&len);
ei_x_free(&state->buffer);
state->buffer.index = 0;
state->buffer.buff = buf;
state->buffer.buffsz = len;
handle_send (state);
}
// shutdown: free all state
ei_x_free (&state->buffer);
free (state);
return EXIT_SUCCESS;
}
/*
 * Reads a packet from Erlang on stdin. Each packet is preceded by a
 * 4-byte big-endian length header ({packet, 4} framing on the Elixir
 * side of the port). Exits cleanly on EOF and aborts if any other
 * error is detected.
 *
 * Returns the packet data; the packet length is stored in *len.
 */
char *read_packet(int *len)
{
char* io_buf = NULL; /* Buffer for file i/o. */
unsigned char header[HEADER_SIZE];
uint32_t packet_length; /* Length of current packet. */
uint32_t bytes_read;
uint32_t total_bytes_read;
/*
* Read the packet header.
*/
total_bytes_read = read(STDIN_FILENO, header, HEADER_SIZE);
if (total_bytes_read == 0) {
exit(0);
}
if (total_bytes_read != HEADER_SIZE) {
panic("Failed to read packet header, read: %d\n", total_bytes_read);
}
/*
* Get the length of this packet.
*/
packet_length = 0;
for (int i = 0; i < HEADER_SIZE; i++)
packet_length = (packet_length << 8) | header[i];
*len=packet_length;
if ((io_buf = (char *) malloc(packet_length)) == NULL) {
panic("insufficient memory for i/o buffer of size %d\n", packet_length);
}
/*
* Read the packet itself.
*/
total_bytes_read = 0;
while((bytes_read = read(STDIN_FILENO, (io_buf + total_bytes_read), (packet_length - total_bytes_read))))
total_bytes_read += bytes_read;
if (total_bytes_read != packet_length) {
free(io_buf);
panic("couldn't read packet of length %d, read: %d\r\n",
packet_length, total_bytes_read);
}
return io_buf;
}
// handle ERL_SEND message type.
// we expect a tuple with arity of 3 or 4 in state->buffer.
// we expect the first argument to be an atom (`decode` or `decode_fragment`),
// the second argument to be the HTML payload, and the
// third argument to be the argument list.
// In case of `decode_fragment`, the fourth argument should be
// the context tag name.
// any other message: respond with an {error, unknown_call} tuple.
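// Example messages, written as Elixir terms:
//   {:decode, "<h1>Hi</h1>", [:html_atoms]}
//   {:decode_fragment, "<i>Hi</i>", [], "div"}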
static void handle_send (state_t * state)
{
// response holds our response, prepare it
ei_x_buff response;
ei_x_new (&response);
// check the protocol version, if it's unsupported, panic
int version;
if (ei_decode_version (state->buffer.buff, &state->buffer.index, &version) < 0)
panic ("malformed message - bad version (%d).", version);
// decode the tuple header
int arity;
if (ei_decode_tuple_header (state->buffer.buff, &state->buffer.index, &arity) < 0)
{
err_term (&response, "badmatch");
goto out;
}
char atom[MAXATOMLEN];
if (ei_decode_atom (state->buffer.buff, &state->buffer.index, atom) < 0)
{
err_term (&response, "badmatch");
goto out;
}
bool fragment = false;
if (strcmp (atom, "decode"))
{
if (strcmp (atom, "decode_fragment")) {
err_term (&response, "unknown_call");
goto out;
} else if (arity != 4) {
err_term (&response, "badmatch");
goto out;
} else {
fragment = true;
}
} else if (arity != 3) {
err_term (&response, "badmatch");
goto out;
}
// the next argument should be a binary, allocate it dynamically.
int bin_type, bin_size;
if (ei_get_type (state->buffer.buff, &state->buffer.index, &bin_type, &bin_size) < 0)
panic ("failed to decode binary size in message");
// verify the type
if (bin_type != ERL_BINARY_EXT)
{
err_term (&response, "badmatch");
goto out;
}
// decode the binary
char * bin_data = calloc (1, bin_size + 1);
if (ei_decode_binary (state->buffer.buff, &state->buffer.index, bin_data, NULL) < 0)
panic ("failed to decode binary in message");
// next should be the options list
if (ei_decode_list_header (state->buffer.buff, &state->buffer.index, &arity) < 0)
panic ("failed to decode options list header in message");
parse_flags_t parse_flags = decode_parse_flags (state, arity);
// Lists with items always have an empty list as their tail
if (arity != 0)
if (ei_decode_list_header (state->buffer.buff, &state->buffer.index, &arity) < 0)
panic ("failed to decode empty list header after option list in message");
lxb_html_document_t *document = lxb_html_document_create();
lxb_dom_element_t *context_element = NULL;
// if we are parsing a fragment, context tag name should come next
if (fragment) {
int context_bin_type, context_bin_size;
if (ei_get_type (state->buffer.buff, &state->buffer.index, &context_bin_type, &context_bin_size) < 0)
panic ("failed to decode binary size in message");
// verify the type
if (context_bin_type != ERL_BINARY_EXT)
{
err_term (&response, "badmatch");
goto out;
}
// decode the binary
char* context_bin_data = calloc (1, context_bin_size + 1);
if (ei_decode_binary (state->buffer.buff, &state->buffer.index, context_bin_data, NULL) < 0)
panic ("failed to decode context binary in message");
context_element = lxb_dom_document_create_element(&document->dom_document, (lxb_char_t*) context_bin_data, context_bin_size, NULL);
free (context_bin_data);
}
if (context_element && lxb_dom_element_tag_id(context_element) >= LXB_TAG__LAST_ENTRY) {
err_term (&response, "unknown_context_tag");
} else {
decode (state, &response, document, fragment, context_element, (lxb_char_t *) bin_data, bin_size, parse_flags);
}
lxb_html_document_destroy(document);
free (bin_data);
out: ;
// send response
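// Frame the reply with a 4-byte big-endian length header, matching the
// {:packet, 4} option used when the port is opened on the Elixir side.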
unsigned char header[HEADER_SIZE];
uint32_t size = (uint32_t) response.index;
for (int i = HEADER_SIZE-1; i != -1; i--) {
header[i] = (unsigned char) size & 0xFF;
size = size >> 8;
}
write(STDOUT_FILENO, header, sizeof(header));
write(STDOUT_FILENO, response.buff, response.index);
// free response
ei_x_free (&response);
return;
}
static void err_term (ei_x_buff * response, const char * error_atom)
{
response->index = 0;
ei_x_encode_version (response);
ei_x_encode_tuple_header (response, 2);
ei_x_encode_atom (response, "error");
ei_x_encode_atom (response, error_atom);
}
static parse_flags_t decode_parse_flags (state_t * state, int arity)
{
parse_flags_t parse_flags = 0;
for (int i = 0; i < arity; i++)
{
char atom[MAXATOMLEN];
if (ei_decode_atom (state->buffer.buff, &state->buffer.index, atom) < 0)
continue;
if (! strcmp ("html_atoms", atom))
parse_flags |= FLAG_HTML_ATOMS;
else if (! strcmp ("nil_self_closing", atom))
parse_flags |= FLAG_NIL_SELF_CLOSING;
else if (! strcmp ("comment_tuple3", atom))
parse_flags |= FLAG_COMMENT_TUPLE3;
}
return parse_flags;
}
static void decode(state_t * state, ei_x_buff * response, lxb_html_document_t *document, bool fragment, lxb_dom_element_t *context_element, lxb_char_t * bin_data, size_t bin_size, parse_flags_t parse_flags)
{
// parse tree
lxb_status_t status;
lxb_dom_node_t *node;
if (fragment) {
node = lxb_html_document_parse_fragment(document, context_element, bin_data, bin_size);
status = (node == NULL)? LXB_STATUS_ERROR : LXB_STATUS_OK;
} else {
status = lxb_html_document_parse(document, bin_data, bin_size);
node = lxb_dom_interface_node(document);
}
if (status != LXB_STATUS_OK)
{
err_term (response, "parse_failed");
return;
}
// build tree
build_tree (response, node, parse_flags);
}
// a tag is sent as a tuple:
// - a string or atom for the tag name
// - an attribute list
// - a children list
// in this function, we prepare the atom and complete attribute list
static void prepare_tag_header (ei_x_buff * response, const char * tag_string, lxb_dom_node_t* node, parse_flags_t parse_flags)
{
lxb_tag_id_t tag_id = lxb_dom_node_tag_id(node);
ei_x_encode_tuple_header (response, 3);
if (! (parse_flags & FLAG_HTML_ATOMS) || (tag_id == LXB_TAG__UNDEF || tag_id >= LXB_TAG__LAST_ENTRY))
ei_x_encode_binary (response, tag_string, strlen (tag_string));
else
ei_x_encode_atom (response, tag_string);
prepare_node_attrs (response, node);
}
// prepare an attribute node
static void prepare_node_attrs(ei_x_buff * response, lxb_dom_node_t* node)
{
lxb_dom_attr_t *attr;
for (attr = lxb_dom_element_first_attribute(lxb_dom_interface_element(node)); attr != NULL; attr = lxb_dom_element_next_attribute(attr))
{
size_t attr_name_len;
char *attr_name = (char*) lxb_dom_attr_qualified_name(attr, &attr_name_len);
size_t attr_value_len;
const char *attr_value = (char*) lxb_dom_attr_value(attr, &attr_value_len);
/* guard against poisoned attribute nodes */
if (! attr_name_len)
continue;
ei_x_encode_list_header (response, 1);
ei_x_encode_tuple_header (response, 2);
ei_x_encode_binary (response, attr_name, attr_name_len);
ei_x_encode_binary (response, attr_value, attr_value_len);
}
ei_x_encode_empty_list (response);
}
// dump a comment node
static void prepare_comment (ei_x_buff * response, const char * node_comment, size_t comment_len, parse_flags_t parse_flags)
{
ei_x_encode_tuple_header (response, parse_flags & FLAG_COMMENT_TUPLE3 ? 3 : 2);
ei_x_encode_atom (response, "comment");
if (parse_flags & FLAG_COMMENT_TUPLE3)
ei_x_encode_list_header (response, 0);
ei_x_encode_binary (response, node_comment, comment_len);
}
#ifdef DEBUG_LIST_MANIP
#define EMIT_LIST_HDR \
printf ("list hdr for node %p\n", current_node); \
fflush (stdout); \
ei_x_encode_list_header (response, 1)
#define EMIT_EMPTY_LIST_HDR \
printf ("list empty for node %p\n", current_node); \
fflush (stdout); \
ei_x_encode_list_header (response, 0)
#define EMIT_LIST_TAIL \
printf ("list tail for node %p\n", current_node); \
fflush (stdout); \
ei_x_encode_empty_list (response)
#else
#define EMIT_LIST_HDR ei_x_encode_list_header (response, 1)
#define EMIT_EMPTY_LIST_HDR ei_x_encode_list_header (response, 0)
#define EMIT_LIST_TAIL ei_x_encode_empty_list (response)
#endif
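// Iterative depth-first walk of the parsed tree: element nodes with children
// are pushed onto the tstack and descended into; list tails are emitted as
// the walk pops back up towards the root node.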
static void build_tree (ei_x_buff * response, lxb_dom_node_t* node, parse_flags_t parse_flags)
{
tstack stack;
tstack_init (&stack, 30);
tstack_push (&stack, node);
lxb_dom_node_t* current_node = node->first_child;
// ok we're going to send an actual response so start encoding it
response->index = 0;
ei_x_encode_version (response);
ei_x_encode_tuple_header(response, 2);
ei_x_encode_atom(response, "ok");
if (current_node == NULL) {
EMIT_EMPTY_LIST_HDR;
EMIT_LIST_TAIL;
}
while (current_node != NULL)
{
if (current_node->type == LXB_DOM_NODE_TYPE_TEXT)
{
size_t text_len;
const char * node_text = (char*) lxb_dom_node_text_content(current_node, &text_len);
EMIT_LIST_HDR;
ei_x_encode_binary (response, node_text, text_len);
}
else if (current_node->type == LXB_DOM_NODE_TYPE_COMMENT)
{
size_t comment_len;
const char* node_comment = (char*) lxb_dom_node_text_content(current_node, &comment_len);
EMIT_LIST_HDR;
prepare_comment(response, node_comment, comment_len, parse_flags);
}
else if(current_node->type == LXB_DOM_NODE_TYPE_ELEMENT)
{
// get name of tag
size_t tag_name_len;
const char *tag_name = (char*) lxb_dom_element_qualified_name(lxb_dom_interface_element(current_node), &tag_name_len);
EMIT_LIST_HDR;
prepare_tag_header (response, tag_name, current_node, parse_flags);
if (current_node->first_child)
{
tstack_push (&stack, current_node);
current_node = current_node->first_child;
continue;
}
else
{
if (parse_flags & FLAG_NIL_SELF_CLOSING && lxb_html_tag_is_void(lxb_dom_node_tag_id(current_node))) {
#ifdef DEBUG_LIST_MANIP
printf ("self-closing tag %s emit nil?\n", tag_string); fflush (stdout);
#endif
ei_x_encode_atom (response, "nil");
}
else
{
EMIT_EMPTY_LIST_HDR;
}
}
}
if (current_node->next)
current_node = current_node->next;
else
{
while (! current_node->next && stack.used != 0)
{
EMIT_LIST_TAIL;
current_node = tstack_pop (&stack);
}
if (current_node->next)
current_node = current_node->next;
}
// are we at root?
if (current_node == node)
break;
}
tstack_free (&stack);
}
static inline char * lowercase(char* c)
{
char * p = c;
while (*p)
{
*p = tolower ((unsigned char) *p);
p++;
}
return c;
}
diff --git a/c_src/tstack.h b/c_src/tstack.h
index ef3f007..1f6223b 100644
--- a/c_src/tstack.h
+++ b/c_src/tstack.h
@@ -1,38 +1,41 @@
+// SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
+// SPDX-License-Identifier: LGPL-2.1-only
+
#ifndef TSTACK_H
#define TSTACK_H
#define GROW_BY 30
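// A minimal growable stack of lxb_dom_node_t pointers, used by build_tree()
// in fasthtml_worker.c; it grows by GROW_BY slots whenever it fills up.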
typedef struct {
lxb_dom_node_t **data;
size_t used;
size_t size;
} tstack;
void tstack_init(tstack *stack, size_t initial_size) {
stack->data = (lxb_dom_node_t **) malloc(initial_size * sizeof(lxb_dom_node_t *));
stack->used = 0;
stack->size = initial_size;
}
void tstack_free(tstack *stack) {
free(stack->data);
}
void tstack_resize(tstack *stack, size_t new_size) {
stack->data = (lxb_dom_node_t **) realloc(stack->data, new_size * sizeof(lxb_dom_node_t *));
stack->size = new_size;
}
void tstack_push(tstack *stack, lxb_dom_node_t * element) {
if(stack->used == stack->size) {
tstack_resize(stack, stack->size + GROW_BY);
}
stack->data[stack->used++] = element;
}
lxb_dom_node_t * tstack_pop(tstack *stack) {
return stack->data[--(stack->used)];
}
#endif
diff --git a/lib/fast_html.ex b/lib/fast_html.ex
index 723778e..0fc92bc 100644
--- a/lib/fast_html.ex
+++ b/lib/fast_html.ex
@@ -1,150 +1,151 @@
# SPDX-FileCopyrightText: 2017-2019 myhtmlex authors <https://github.com/Overbryd/myhtmlex>
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
# SPDX-License-Identifier: LGPL-2.1-only
defmodule :fast_html do
@moduledoc """
A module to decode html into a tree structure.
"""
@type tag() :: String.t() | atom()
@type attr() :: {String.t(), String.t()}
@type attr_list() :: [] | [attr()]
@type comment_node() :: {:comment, String.t()}
@type comment_node3() :: {:comment, [], String.t()}
@type tree() ::
{tag(), attr_list(), tree()}
| {tag(), attr_list(), nil}
| comment_node()
| comment_node3()
@type format_flag() :: :html_atoms | :nil_self_closing | :comment_tuple3
@doc """
Returns a tree representation from the given html string.
`opts` is a keyword list of options, the options available:
* `timeout` - Call timeout. If pooling is used and the worker doesn't return
the result in time, the worker will be killed with a warning.
* `format` - Format flags for the tree.
The following format flags are available:
* `:html_atoms` uses atoms for known html tags (faster), binaries for everything else.
* `:nil_self_closing` uses `nil` to designate void elements.
For example `<br>` is then being represented like `{"br", [], nil}`.
See http://w3c.github.io/html-reference/syntax.html#void-elements for a full list of void elements.
* `:comment_tuple3` uses 3-tuple elements for comments, instead of the default 2-tuple element.
## Examples
iex> :fast_html.decode("<h1>Hello world</h1>")
{:ok, [{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}]}
iex> :fast_html.decode("Hello world", timeout: 0)
{:error, :timeout}
iex> :fast_html.decode("<span class='hello'>Hi there</span>")
{:ok, [{"html", [],
[{"head", [], []},
{"body", [], [{"span", [{"class", "hello"}], ["Hi there"]}]}]}]}
iex> :fast_html.decode("<body><!-- a comment --!></body>")
{:ok, [{"html", [], [{"head", [], []}, {"body", [], [comment: " a comment "]}]}]}
iex> :fast_html.decode("<br>")
{:ok, [{"html", [], [{"head", [], []}, {"body", [], [{"br", [], []}]}]}]}
iex> :fast_html.decode("<h1>Hello world</h1>", format: [:html_atoms])
{:ok, [{:html, [], [{:head, [], []}, {:body, [], [{:h1, [], ["Hello world"]}]}]}]}
iex> :fast_html.decode("<br>", format: [:nil_self_closing])
{:ok, [{"html", [], [{"head", [], []}, {"body", [], [{"br", [], nil}]}]}]}
iex> :fast_html.decode("<body><!-- a comment --!></body>", format: [:comment_tuple3])
{:ok, [{"html", [], [{"head", [], []}, {"body", [], [{:comment, [], " a comment "}]}]}]}
iex> html = "<body><!-- a comment --!><unknown /></body>"
iex> :fast_html.decode(html, format: [:html_atoms, :nil_self_closing, :comment_tuple3])
{:ok, [{:html, [],
[{:head, [], []},
{:body, [], [{:comment, [], " a comment "}, {"unknown", [], []}]}]}]}
"""
@spec decode(String.t(), format: [format_flag()]) ::
{:ok, tree()} | {:error, String.t() | atom()}
def decode(bin, opts \\ []) do
flags = Keyword.get(opts, :format, [])
timeout = Keyword.get(opts, :timeout, 10000)
find_and_use_port({:decode, bin, flags}, timeout, opts)
end
@doc """
Like `decode/2`, but for parsing [HTML fragments](https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments).
`opts` is a keyword list of options, the options available are the same as in `decode/2` with addition of:
* `context` - Name of the context element, defaults to `div`
Example:
iex> :fast_html.decode_fragment("rin is the <i>best</i> girl")
{:ok, ["rin is the ", {"i", [], ["best"]}, " girl"]}
iex> :fast_html.decode_fragment("rin is the <i>best</i> girl", context: "title")
{:ok, ["rin is the <i>best</i> girl"]}
iex> :fast_html.decode_fragment("rin is the <i>best</i> girl", context: "objective_truth")
{:error, :unknown_context_tag}
iex> :fast_html.decode_fragment("rin is the <i>best</i> girl", format: [:html_atoms])
{:ok, ["rin is the ", {:i, [], ["best"]}, " girl"]}
"""
def decode_fragment(bin, opts \\ []) do
flags = Keyword.get(opts, :format, [])
timeout = Keyword.get(opts, :timeout, 10000)
context = Keyword.get(opts, :context, "div")
find_and_use_port({:decode_fragment, bin, flags, context}, timeout, opts)
end
@default_pool FastHtml.Pool
defp find_and_use_port(term_command, timeout, opts) do
command = :erlang.term_to_binary(term_command)
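# Pool selection: an explicit :pool option takes precedence; otherwise the
# :fast_html application config decides whether the default pool is used.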
pool =
cond do
pool = Keyword.get(opts, :pool) -> pool
Application.get_env(:fast_html, :pool, enabled: true)[:enabled] -> @default_pool
true -> nil
end
execute_command_fun = fn port ->
send(port, {self(), {:command, command}})
receive do
{^port, {:data, res}} -> {:ok, res}
after
timeout ->
{:error, :timeout}
end
end
result =
if pool do
FastHtml.Pool.get_port(pool, execute_command_fun)
else
port = open_port()
result = execute_command_fun.(port)
Port.close(port)
result
end
case result do
{:ok, result} -> :erlang.binary_to_term(result)
{:error, _} = e -> e
end
end
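# Spawns a worker port directly; {:packet, 4} matches the 4-byte length
# header the C worker reads from and writes to stdio.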
def open_port do
Port.open({:spawn_executable, Path.join([:code.priv_dir(:fast_html), "fasthtml_worker"])}, [
:binary,
{:packet, 4},
:use_stdio,
:exit_status
])
end
end
diff --git a/lib/fast_html/application.ex b/lib/fast_html/application.ex
index 68b3c3b..0502701 100644
--- a/lib/fast_html/application.ex
+++ b/lib/fast_html/application.ex
@@ -1,15 +1,18 @@
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
+# SPDX-License-Identifier: LGPL-2.1-only
+
defmodule FastHtml.Application do
@moduledoc false
use Application
def start(_type, _args) do
default_pool_config = Application.get_env(:fast_html, :pool, enabled: true)
children = if default_pool_config[:enabled], do: [FastHtml.Pool], else: []
Supervisor.start_link(children,
strategy: :one_for_one,
name: FastHtml.Supervisor
)
end
end
diff --git a/lib/fast_html/pool.ex b/lib/fast_html/pool.ex
index 47a29e1..2032f28 100644
--- a/lib/fast_html/pool.ex
+++ b/lib/fast_html/pool.ex
@@ -1,132 +1,135 @@
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
+# SPDX-License-Identifier: LGPL-2.1-only
+
defmodule FastHtml.Pool do
@behaviour NimblePool
@moduledoc """
A NimblePool-based pool of `fast_html` worker ports.
"""
require Logger
@doc false
def child_spec(opts) do
%{
id: __MODULE__,
start: {__MODULE__, :start_link, [opts]},
type: :worker,
restart: :permanent
}
end
@doc """
Starts the port pool.
### Options
- `:size` - Number of ports in the pool. Defaults to `System.schedulers_online/0` if not set.
- `:name` - Registered name of the pool. Defaults to `#{__MODULE__}` if not set, set to `false` to not register the process.
"""
@type option :: {:size, pos_integer()} | {:name, atom()}
@spec start_link([option()]) :: term()
def start_link(options) do
{size, options} = Keyword.pop(options, :size, System.schedulers_online())
NimblePool.start_link(worker: {__MODULE__, options}, pool_size: size)
end
@type pool :: atom() | pid()
@type result :: {:ok, term()} | {:error, atom()}
@spec get_port(pool(), (port() -> result())) :: result()
def get_port(pool, fun) do
NimblePool.checkout!(pool, :checkout, fn _from, port ->
result = fun.(port)
client_state =
case result do
{:ok, _} ->
:ok
{:error, reason} ->
reason
end
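# Reconnect the port to the pool process before check-in, so the pool
# (rather than this client) receives any further messages from the port.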
send(port, {self(), {:connect, GenServer.whereis(pool)}})
client_state =
receive do
{^port, :connected} -> client_state
{:EXIT, ^port, reason} -> {:EXIT, reason}
end
{result, client_state}
end)
end
@impl NimblePool
@doc false
def init_pool(state) do
{name, options} =
case Keyword.pop(state, :name) do
{nil, state} -> {__MODULE__, state}
{name, state} when is_atom(name) -> {name, state}
{_, state} -> {nil, state}
end
if name, do: Process.register(self(), name)
{:ok, options}
end
@impl NimblePool
@doc false
def init_worker(pool_state) do
port = :fast_html.open_port()
{:ok, port, pool_state}
end
@impl NimblePool
@doc false
def terminate_worker({:EXIT, reason}, port, pool_state) do
Logger.warn(fn ->
"[#{__MODULE__}]: Port #{port} unexpectedly exited with reason: #{reason}"
end)
{:ok, pool_state}
end
@impl NimblePool
@doc false
def terminate_worker(_reason, port, pool_state) do
Port.close(port)
{:ok, pool_state}
end
@impl NimblePool
@doc false
def handle_checkout(:checkout, {client_pid, _}, port, pool_state) do
send(port, {self(), {:connect, client_pid}})
receive do
{^port, :connected} -> {:ok, port, port, pool_state}
{:EXIT, ^port, reason} -> {:remove, {:EXIT, reason}}
end
end
@impl NimblePool
@doc false
def handle_checkin(:timeout, _, _, pool_state), do: {:remove, :timeout, pool_state}
@impl NimblePool
@doc false
def handle_checkin(_, _, port, pool_state), do: {:ok, port, pool_state}
@impl NimblePool
@doc false
def handle_info({:EXIT, port, reason}, port), do: {:remove, {:EXIT, reason}}
@impl NimblePool
@doc false
def handle_info({:EXIT, _, _}, port), do: {:ok, port}
# Port sent data to the pool, this happens when the timeout was reached
# and the port got disconnected from the client, but not yet killed by the pool.
# Just discard the message.
@impl NimblePool
@doc false
def handle_info({_sending_port, {:data, _}}, port), do: {:ok, port}
end
diff --git a/lib/mix/tasks/fast_html/bench.ex b/lib/mix/tasks/fast_html/bench.ex
index 32823ee..1198615 100644
--- a/lib/mix/tasks/fast_html/bench.ex
+++ b/lib/mix/tasks/fast_html/bench.ex
@@ -1,31 +1,34 @@
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
+# SPDX-License-Identifier: LGPL-2.1-only
+
if Mix.env() == :bench do
defmodule Mix.Tasks.FastHtml.Bench do
@moduledoc "Benchmarking task."
use Mix.Task
@input_dir "bench_fixtures"
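# The task module is only compiled in the :bench env, so it is run as, e.g.:
#   MIX_ENV=bench mix fast_html.bench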
def run(_) do
Application.ensure_all_started(:fast_html)
inputs =
Enum.reduce(File.ls!(@input_dir), %{}, fn input_name, acc ->
input = File.read!(Path.join(@input_dir, input_name))
Map.put(acc, input_name, input)
end)
Benchee.run(
%{
"fast_html" => fn input -> :fast_html.decode(input) end,
"myhtmlex nif" => fn input -> Myhtmlex.Nif.decode(input) end,
"html5ever nif" => fn input -> Html5ever.parse(input) end,
"mochiweb_html" => fn input -> :mochiweb_html.parse(input) end
},
inputs: inputs,
save: [path: "fast_html.bench"],
load: "fast_html.bench"
)
end
end
end
diff --git a/mix.exs b/mix.exs
index a98ac53..06f1b7d 100644
--- a/mix.exs
+++ b/mix.exs
@@ -1,116 +1,117 @@
# SPDX-FileCopyrightText: 2017-2019 myhtmlex authors <https://github.com/Overbryd/myhtmlex>
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
# SPDX-License-Identifier: LGPL-2.1-only
defmodule FastHtml.Mixfile do
use Mix.Project
def project do
[
app: :fast_html,
version: "2.0.5",
elixir: "~> 1.5",
deps: deps(),
package: package(),
compilers: [:elixir_make] ++ Mix.compilers(),
make_env: make_env(),
make_error_message: make_error_message(),
build_embedded: Mix.env() == :prod,
start_permanent: Mix.env() == :prod,
name: "FastHtml",
description: """
A module to decode HTML into a tree,
wrapping the underlying lexbor library,
which is fast and correct
with regard to the HTML spec.
""",
docs: docs()
]
end
def package do
[
maintainers: ["Ariadne Conill", "rinpatch"],
licenses: ["LGPL-2.1-only"],
links: %{
"GitLab" => "https://git.pleroma.social/pleroma/elixir-libraries/fast_html/",
"Issues" => "https://git.pleroma.social/pleroma/elixir-libraries/fast_html/issues",
"lexbor" => "https://github.com/lexbor/lexbor"
},
files: hex_files()
]
end
def application do
[
extra_applications: [:logger],
mod: {FastHtml.Application, []}
]
end
defp deps do
[
# documentation helpers
{:ex_doc, "~> 0.19", only: :dev},
# benchmarking helpers
{:benchee, "~> 1.0", only: :bench, optional: true},
{:dialyxir, "~> 1.0", only: [:dev, :test], runtime: false},
{:myhtmlex, "~> 0.2.0", only: :bench, runtime: false, optional: true},
{:mochiweb, "~> 2.18", only: :bench, optional: true},
{:html5ever,
git: "https://github.com/rusterlium/html5ever_elixir.git", only: :bench, optional: true},
{:nimble_pool, "~> 0.2.0"},
{:elixir_make, "~> 0.4", runtime: false}
]
end
defp docs do
[
main: "readme",
extras: ["README.md", "CHANGELOG.md"]
]
end
defp hex_files do
# This is run every time mix is executed, so it will fail in the hex package,
# therefore check if git is even available
if File.exists?(".git") and System.find_executable("git") do
{files, 0} = System.cmd("git", ["ls-files", "--recurse-submodules"])
files
|> String.split("\n")
# Last element is "", which makes hex include all files in the folder to the project
|> List.delete_at(-1)
|> Enum.reject(fn path ->
Path.dirname(path) == "bench_fixtures" or
(Path.dirname(path) != "priv" and String.starts_with?(Path.basename(path), "."))
end)
else
[]
end
end
defp otp_version do
:erlang.system_info(:otp_release)
|> to_string()
|> String.to_integer()
end
defp otp_22_or_newer? do
otp_version() >= 22
end
defp make_env do
%{
"OTP22_DEF" =>
if otp_22_or_newer?() do
"YES"
else
"NO"
end
}
end
defp make_error_message,
do:
"Please check you have: a C compiler, GNU\Make, CMake and Erlang development headers installed before reporting an issue."
end
diff --git a/mix.lock.license b/mix.lock.license
index cd7ec38..e7038e9 100644
--- a/mix.lock.license
+++ b/mix.lock.license
@@ -1,4 +1,4 @@
SPDX-FileCopyrightText: 2017-2019 myhtmlex authors <https://github.com/Overbryd/myhtmlex>
-SPDX-FileCopyrightText: 2019-2022 Pleroma Authors
+SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
SPDX-License-Identifier: LGPL-2.1-only
diff --git a/test/fast_html_test.exs b/test/fast_html_test.exs
index 588732d..d243c0a 100644
--- a/test/fast_html_test.exs
+++ b/test/fast_html_test.exs
@@ -1,168 +1,169 @@
# SPDX-FileCopyrightText: 2017-2019 myhtmlex authors <https://github.com/Overbryd/myhtmlex>
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
# SPDX-License-Identifier: LGPL-2.1-only
defmodule :fast_html_test do
use ExUnit.Case
doctest :fast_html
test "doesn't segfault when <!----> is encountered" do
assert {:ok, [{"html", _attrs, _children}]} = :fast_html.decode("<div> <!----> </div>")
end
test "builds a tree, formatted like mochiweb by default" do
assert {:ok,
[
{"html", [],
[
{"head", [], []},
{"body", [],
[
{"br", [], []}
]}
]}
]} = :fast_html.decode("<br>")
end
test "builds a tree, html tags as atoms" do
assert {:ok,
[
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:br, [], []}
]}
]}
]} = :fast_html.decode("<br>", format: [:html_atoms])
end
test "builds a tree, nil self closing" do
assert {:ok,
[
{"html", [],
[
{"head", [], []},
{"body", [],
[
{"br", [], nil},
{"esi:include", [], []}
]}
]}
]} = :fast_html.decode("<br><esi:include />", format: [:nil_self_closing])
end
test "builds a tree, multiple format options" do
assert {:ok,
[
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:br, [], nil}
]}
]}
]} = :fast_html.decode("<br>", format: [:html_atoms, :nil_self_closing])
end
test "attributes" do
assert {:ok,
[
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:span, [{"id", "test"}, {"class", "foo garble"}], []}
]}
]}
]} =
:fast_html.decode(~s'<span id="test" class="foo garble"></span>',
format: [:html_atoms]
)
end
test "single attributes" do
assert {:ok,
[
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:button, [{"disabled", ""}, {"class", "foo garble"}], []}
]}
]}
]} =
:fast_html.decode(~s'<button disabled class="foo garble"></span>',
format: [:html_atoms]
)
end
test "text nodes" do
assert {:ok,
[
{:html, [],
[
{:head, [], []},
{:body, [],
[
"text node"
]}
]}
]} = :fast_html.decode(~s'<body>text node</body>', format: [:html_atoms])
end
test "broken input" do
assert {:ok,
[
{:html, [],
[
{:head, [], []},
{:body, [],
[
{:a, [{"<", ""}], [" asdf"]}
]}
]}
]} = :fast_html.decode(~s'<a <> asdf', format: [:html_atoms])
end
test "custom namespaced tags" do
assert {:ok,
[
{:html, [],
[
{:head, [], []},
{:body, [],
[
{"esi:include", [], []}
]}
]}
]} = :fast_html.decode(~s'<esi:include />', format: [:html_atoms, :nil_self_closing])
end
test "html comments" do
assert {:ok,
[
{:html, [],
[
{:head, [], []},
{:body, [],
[
comment: " a comment "
]}
]}
]} = :fast_html.decode(~s'<body><!-- a comment --></body>', format: [:html_atoms])
end
test "doesn't go into an infinite loop when there are more than one root tags" do
assert {:ok, [{:comment, " a comment "}, {"html", [], [{"head", [], []}, {"body", [], []}]}]} ==
:fast_html.decode("<!-- a comment --> <html> </html>")
end
test "doesn't return attribute name in attribute value when the latter is empty" do
assert :fast_html.decode_fragment("<meta content=\"\"/>") ==
{:ok, [{"meta", [{"content", ""}], []}]}
end
end
diff --git a/test/test_helper.exs b/test/test_helper.exs
index 2f44730..021039e 100644
--- a/test/test_helper.exs
+++ b/test/test_helper.exs
@@ -1,4 +1,5 @@
# SPDX-FileCopyrightText: 2017-2019 myhtmlex authors <https://github.com/Overbryd/myhtmlex>
+# SPDX-FileCopyrightText: 2019-2022 Pleroma Authors <https://pleroma.social>
# SPDX-License-Identifier: LGPL-2.1-only
ExUnit.start()
