Page MenuHomePhorge

No OneTemporary

Size
31 KB
Referenced Files
None
Subscribers
None
diff --git a/Makefile b/Makefile
index 38a44e2..2bb3bdf 100644
--- a/Makefile
+++ b/Makefile
@@ -1,96 +1,93 @@
MIX = mix
MYHTMLEX_CFLAGS = -g -O2 -std=c99 -pedantic -Wcomment -Wall
# we need to compile position independent code
MYHTMLEX_CFLAGS += -fpic -DPIC
# For some reason __erl_errno is undefined unless _REENTRANT is defined
MYHTMLEX_CFLAGS += -D_REENTRANT
# myhtmlex is using stpcpy, as defined in gnu string.h
# MYHTMLEX_CFLAGS += -D_GNU_SOURCE
# base on the same posix c source as myhtml
# MYHTMLEX_CFLAGS += -D_POSIX_C_SOURCE=199309
# turn warnings into errors
# MYHTMLEX_CFLAGS += -Werror
# ignore unused variables
# MYHTMLEX_CFLAGS += -Wno-unused-variable
# ignore unused parameter warnings
MYHTMLEX_CFLAGS += -Wno-unused-parameter
# set erlang include path
ERLANG_PATH = $(shell erl -eval 'io:format("~s", [lists:concat([code:root_dir(), "/erts-", erlang:system_info(version)])])' -s init stop -noshell)
MYHTMLEX_CFLAGS += -I$(ERLANG_PATH)/include
# expecting myhtml as a submodule in c_src/
# that way we can pin a version and package the whole thing in hex
# hex does not allow for non-app related dependencies.
MYHTML_PATH = c_src/myhtml
MYHTML_STATIC = $(MYHTML_PATH)/lib/libmyhtml_static.a
MYHTMLEX_CFLAGS += -I$(MYHTML_PATH)/include
# avoid undefined reference errors to pthread_mutex_trylock
MYHTMLEX_CFLAGS += -lpthread
# that would be used for a dynamically linked build
# MYHTMLEX_CFLAGS += -L$(MYHTML_PATH)/lib
MYHTMLEX_LDFLAGS = -shared
# C-Node
ERL_INTERFACE = $(wildcard $(ERLANG_PATH)/../lib/erl_interface-*)
CNODE_CFLAGS = $(MYHTMLEX_CFLAGS)
CNODE_CFLAGS += -L$(ERL_INTERFACE)/lib
CNODE_CFLAGS += -I$(ERL_INTERFACE)/include
CNODE_CFLAGS += -lerl_interface -lei
# enumerate docker build tests
BUILD_TESTS := $(patsubst %.dockerfile, %.dockerfile.PHONY, $(wildcard ./build-test/*.dockerfile))
# platform specific environment
# UNAME holds the kernel name printed by `uname -s` (e.g. Darwin, Linux)
UNAME = $(shell uname -s)
# test the variable that is actually defined above; comparing the undefined
# UNAME_S against Darwin is always false and silently skips the macOS flags
ifeq ($(UNAME),Darwin)
MYHTMLEX_LDFLAGS += -dynamiclib -undefined dynamic_lookup
else
# myhtmlex is using stpcpy, as defined in gnu string.h
MYHTMLEX_CFLAGS += -D_GNU_SOURCE
# base on the same posix c source as myhtml
# MYHTMLEX_CFLAGS += -D_POSIX_C_SOURCE=199309
endif
.PHONY: all
all: myhtmlex
-myhtmlex: priv/myhtmlex.so
+myhtmlex: priv/myhtml_worker
$(MIX) compile
$(MYHTML_STATIC): $(MYHTML_PATH)
$(MAKE) -C $(MYHTML_PATH) library MyCORE_BUILD_WITHOUT_THREADS=YES
-priv/myhtmlex.so: c_src/myhtmlex.c $(MYHTML_STATIC)
- $(CC) $(MYHTMLEX_CFLAGS) $(MYHTMLEX_LDFLAGS) -o $@ $< $(MYHTML_STATIC)
-
priv/myhtml_worker: c_src/myhtml_worker.c $(MYHTML_STATIC)
$(CC) -o $@ $< $(MYHTML_STATIC) $(CNODE_CFLAGS)
clean: clean-myhtml
$(RM) -r priv/myhtmlex*
$(RM) priv/myhtml_worker
$(RM) myhtmlex-*.tar
$(RM) -r package-test
clean-myhtml:
$(MAKE) -C $(MYHTML_PATH) clean
# publishing the package and docs separately is required
# otherwise the build artifacts are included in the package
# and the tarball gets too big to be published
publish: clean
$(MIX) hex.publish package
$(MIX) hex.publish docs
test:
$(MIX) test
build-tests: test $(BUILD_TESTS)
%.dockerfile.PHONY: %.dockerfile
docker build -f $< .
diff --git a/README.md b/README.md
index e1b62d8..65a3cdc 100644
--- a/README.md
+++ b/README.md
@@ -1,94 +1,47 @@
# Myhtmlex
Bindings for lexborisov's [myhtml](https://github.com/lexborisov/myhtml).
* Available as a hex package: `{:myhtmlex, "~> 0.2.0"}`
* [Documentation](https://hexdocs.pm/myhtmlex/Myhtmlex.html)
## Example
iex> Myhtmlex.decode("<h1>Hello world</h1>")
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}
- Benchmark results (Nif calling mode) on various file sizes on a 2,5Ghz Core i7:
+ Benchmark results (removed Nif calling mode) on various file sizes on a 2,5Ghz Core i7:
Settings:
duration: 1.0 s
## FileSizesBench
[15:28:42] 1/3: github_trending_js.html 341k
[15:28:46] 2/3: w3c_html5.html 131k
[15:28:48] 3/3: wikipedia_hyperlink.html 97k
Finished in 7.52 seconds
## FileSizesBench
benchmark name iterations average time
wikipedia_hyperlink.html 97k 1000 1385.86 µs/op
w3c_html5.html 131k 1000 2179.30 µs/op
github_trending_js.html 341k 500 5686.21 µs/op
-## Configuration
-
-The module you are calling into is always `Myhtmlex` and depending on your application configuration,
-it chooses between the underlying implementations `Myhtmlex.Safe` (default) and `Myhtmlex.Nif`.
-
-Erlang interoperability is a tricky mine-field.
-You can call into C directly using native implemented functions (Nif). But this comes with the risk,
-that if anything goes wrong within the C implementation, your whole VM will crash.
-No more supervisor cushions for here on, just violent crashes.
-
-That is why the default mode of operation keeps your VM safe and happy.
-If you need ultimate parsing speed, or you can simply tolerate VM-level crashes, read on.
-
-### Call into C-Node (default)
-
-This is the default mode of operation.
-If your application cannot tolerate VM-level crashes, this option allows you to gain the best of both worlds.
-The added overhead is client/server communications, and a worker OS-process that runs next to your VM under VM supervision.
-
-You do not have to do anything to start the worker process, everything is taken care of within the library.
-If you are not running in distributed mode, your VM will automatically be assigned a `sname`.
-
-The worker OS-process stays alive as long as it is under VM-supervision. If your VM goes down, the OS-process will die by itself.
-If the worker OS-process dies for some reason, your VM stays unaffected and will attempt to restart it seamlessly.
-
-### Call into Nif
-
-If your application is aiming for ultimate parsing speed, and in the worst case can tolerate VM-level crashes, you can call directly into the Nif.
-
-1. Require myhtmlex without runtime
-
- in your `mix.exs`
-
- def deps do
- [
- {:myhtmlex, ">= 0.0.0", runtime: false}
- ]
- end
-
-2. Configure the mode to `Myhtmlex.Nif`
-
- e.g. in `config/config.exs`
-
- config :myhtmlex, mode: Myhtmlex.Nif
-
-3. Bonus: You can [open up in-memory references to parsed trees](https://hexdocs.pm/myhtmlex/Myhtmlex.html#open/1), without parsing + mapping erlang terms in one go
-
## Contribution / Bug Reports
* Please make sure you do `git submodule update` after a checkout/pull
* If you have problems building the project, please consider adding a Dockerfile to `build-tests/` to replicate the build error
* The project aims to be fully tested
## Roadmap
The exposed functions on `Myhtmlex` are not subject to change.
This project is under active development.
* [ ] Expose node-retrieval functions
* [x] Parse a HTML-document into a tree
* [x] Investigate safety and calling options
* [x] Call as dirty-nif
* [x] Call as C-Node (check branch `c-node`)
diff --git a/c_src/myhtmlex.c b/c_src/myhtmlex.c
deleted file mode 100644
index 7718370..0000000
--- a/c_src/myhtmlex.c
+++ /dev/null
@@ -1,426 +0,0 @@
-#include "myhtmlex.h"
-
-char*
-lowercase(char* c)
-{
- char* p = c;
- while(*p)
- {
- *p = tolower((unsigned char)*p);
- p++;
- }
- return c;
-}
-
-ERL_NIF_TERM
-make_atom(ErlNifEnv* env, const char* name)
-{
- ERL_NIF_TERM ret;
- if(enif_make_existing_atom(env, name, &ret, ERL_NIF_LATIN1)) {
- return ret;
- }
- return enif_make_atom(env, name);
-}
-
-ERL_NIF_TERM
-nif_open(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
- ERL_NIF_TERM result;
- myhtmlex_ref_t* ref;
-
- // fetch nif state
- myhtmlex_state_t* state = (myhtmlex_state_t*) enif_priv_data(env);
-
- // placeholder for the html binary we want to read from erlang caller
- ErlNifBinary html_bin;
- // read binary into &html_bin from argv[0] (first argument)
- if (!enif_inspect_iolist_as_binary(env, argv[0], &html_bin))
- {
- // blame the user if html_bin is not a binary
- return enif_make_badarg(env);
- }
-
- ref = enif_alloc_resource(state->myhtml_tree_rt, sizeof(myhtmlex_ref_t));
- ref->tree = myhtml_tree_create();
- myhtml_tree_init(ref->tree, state->myhtml);
- mystatus_t status = myhtml_parse(ref->tree, MyENCODING_UTF_8, (char*) html_bin.data, (size_t) html_bin.size);
- if (status != MyHTML_STATUS_OK)
- {
- // TODO: what is the correct reaction for a not ok state?
- return enif_make_badarg(env);
- }
- ref->root = myhtml_tree_get_document(ref->tree);
-
- result = enif_make_resource(env, ref);
- return result;
-}
-
-ERL_NIF_TERM
-nif_decode_tree(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
- ERL_NIF_TERM result;
- myhtmlex_ref_t* ref;
- unsigned char parse_flags = 0;
-
- // fetch nif state
- myhtmlex_state_t* state = (myhtmlex_state_t*) enif_priv_data(env);
-
- // fetch reference
- if (!enif_get_resource(env, argv[0], state->myhtml_tree_rt, (void **) &ref))
- {
- return enif_make_badarg(env);
- }
- // we should have received format flags in a list
- if (argc == 2)
- {
- if (!enif_is_list(env, argv[1]))
- {
- // blame the user if second argument is not a list
- return enif_make_badarg(env);
- }
- parse_flags = read_parse_flags(env, &argv[1]);
- }
-
- // build erlang tree
- result = build_tree(env, ref->tree, myhtml_node_last_child(ref->root), &parse_flags);
-
- // return tree to erlang
- return result;
-}
-
-ERL_NIF_TERM
-nif_decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
- ERL_NIF_TERM result;
- unsigned char parse_flags = 0;
-
- // fetch nif state
- myhtmlex_state_t* state = (myhtmlex_state_t*) enif_priv_data(env);
-
- // placeholder for the html binary we want to read from erlang caller
- ErlNifBinary html_bin;
- // read binary into &html_bin from argv[0] (first argument)
- if (argc < 1 || !enif_inspect_iolist_as_binary(env, argv[0], &html_bin))
- {
- // blame the user if html_bin is not a binary
- return enif_make_badarg(env);
- }
- // we should have received format flags in a list
- if (argc == 2)
- {
- if (!enif_is_list(env, argv[1]))
- {
- // blame the user if second argument is not a list
- return enif_make_badarg(env);
- }
- parse_flags = read_parse_flags(env, &argv[1]);
- }
-
- // parse html into tree
- // use parse_single for now, threaded mode is buggy with some files
- mystatus_t status = myhtml_parse(state->tree, MyENCODING_UTF_8, (char*) html_bin.data, (size_t) html_bin.size);
- if (status != MyHTML_STATUS_OK)
- {
- // TODO: what is the correct reaction for a not ok state?
- return enif_make_badarg(env);
- }
-
- // build erlang tree
- myhtml_tree_node_t *root = myhtml_tree_get_document(state->tree);
- result = build_tree(env, state->tree, myhtml_node_last_child(root), &parse_flags);
-
- // return tree to erlang
- return result;
-}
-
-unsigned char
-read_parse_flags(ErlNifEnv* env, const ERL_NIF_TERM* options)
-{
- unsigned char parse_flags = 0;
- ERL_NIF_TERM flag;
-
- while (enif_get_list_cell(env, *options, &flag, (ERL_NIF_TERM*)options))
- {
- if (!enif_is_atom(env, flag)) return enif_make_badarg(env);
- // set parse flags
- if (enif_compare(flag, ATOM_HTML_ATOMS) == 0)
- {
- parse_flags |= FLAG_HTML_ATOMS;
- }
- else if (enif_compare(flag, ATOM_NIL_SELF_CLOSING) == 0)
- {
- parse_flags |= FLAG_NIL_SELF_CLOSING;
- }
- else if (enif_compare(flag, ATOM_COMMENT_TUPLE3) == 0)
- {
- parse_flags |= FLAG_COMMENT_TUPLE3;
- }
- }
-
- return parse_flags;
-}
-
-ERL_NIF_TERM
-build_node_children(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* parent, unsigned char* parse_flags)
-{
- if (myhtml_node_is_close_self(parent) && (*parse_flags & FLAG_NIL_SELF_CLOSING))
- {
- return ATOM_NIL;
- }
-
- myhtml_tree_node_t* child = myhtml_node_last_child(parent);
- if (child == NULL)
- {
- if (myhtml_node_is_void_element(parent) && (*parse_flags & FLAG_NIL_SELF_CLOSING))
- {
- return ATOM_NIL;
- }
- else
- {
- return EMPTY_LIST;
- }
- }
-
- ERL_NIF_TERM list = enif_make_list(env, 0);
-
- while (child)
- {
- ERL_NIF_TERM node_tuple = build_tree(env, tree, child, parse_flags);
- list = enif_make_list_cell(env, node_tuple, list);
-
- // get previous child, building the list from reverse
- child = myhtml_node_prev(child);
- }
-
- return list;
-}
-
-ERL_NIF_TERM
-build_node_attrs(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node)
-{
- myhtml_tree_attr_t* attr = myhtml_node_attribute_last(node);
- if (attr == NULL)
- {
- return EMPTY_LIST;
- }
-
- ERL_NIF_TERM list = enif_make_list(env, 0);
-
- while (attr)
- {
- ErlNifBinary name;
- ERL_NIF_TERM name_bin;
- ErlNifBinary value;
- ERL_NIF_TERM value_bin;
- ERL_NIF_TERM attr_tuple;
-
- size_t attr_name_len;
- const char *attr_name = myhtml_attribute_key(attr, &attr_name_len);
- size_t attr_value_len;
- const char *attr_value = myhtml_attribute_value(attr, &attr_value_len);
-
- if (attr_value) {
- enif_alloc_binary(attr_value_len, &value);
- memcpy(value.data, attr_value, attr_value_len);
- value_bin = enif_make_binary(env, &value);
- } else {
- enif_alloc_binary(attr_name_len, &value);
- memcpy(value.data, attr_name, attr_name_len);
- value_bin = enif_make_binary(env, &value);
- }
- enif_alloc_binary(attr_name_len, &name);
- memcpy(name.data, attr_name, attr_name_len);
- name_bin = enif_make_binary(env, &name);
-
- attr_tuple = enif_make_tuple2(env, name_bin, value_bin);
- list = enif_make_list_cell(env, attr_tuple, list);
-
- // get prev attribute, building the list from reverse
- attr = myhtml_attribute_prev(attr);
- }
-
- return list;
-}
-
-ERL_NIF_TERM
-build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsigned char* parse_flags)
-{
- ERL_NIF_TERM result;
- myhtml_tag_id_t tag_id = myhtml_node_tag_id(node);
- myhtml_namespace_t tag_ns = myhtml_node_namespace(node);
-
- if (tag_id == MyHTML_TAG__TEXT)
- {
- ErlNifBinary text;
- size_t text_len;
- const char* node_text = myhtml_node_text(node, &text_len);
- enif_alloc_binary(text_len, &text);
- memcpy(text.data, node_text, text_len);
-
- result = enif_make_binary(env, &text);
- }
- else if (tag_id == MyHTML_TAG__COMMENT)
- {
- ErlNifBinary comment;
- size_t comment_len;
- const char* node_comment = myhtml_node_text(node, &comment_len);
- enif_alloc_binary(comment_len, &comment);
- memcpy(comment.data, node_comment, comment_len);
-
- if (*parse_flags & FLAG_COMMENT_TUPLE3)
- {
- result = enif_make_tuple3(env,
- ATOM_COMMENT,
- EMPTY_LIST,
- enif_make_binary(env, &comment)
- );
- }
- else
- {
- result = enif_make_tuple2(env, ATOM_COMMENT, enif_make_binary(env, &comment));
- }
- }
- else
- {
- ERL_NIF_TERM tag;
- ERL_NIF_TERM children;
- ERL_NIF_TERM attrs;
-
- // get name of tag
- size_t tag_name_len;
- const char *tag_name = myhtml_tag_name_by_id(tree, tag_id, &tag_name_len);
- // get namespace of tag
- size_t tag_ns_len;
- const char *tag_ns_name_ptr = myhtml_namespace_name_by_id(tag_ns, &tag_ns_len);
- char *tag_ns_buffer;
- char buffer [tag_ns_len + tag_name_len + 1];
- char *tag_string = buffer;
- size_t tag_string_len;
-
- if (tag_ns != MyHTML_NAMESPACE_HTML)
- {
- // tag_ns_name_ptr is unmodifyable, copy it in our tag_ns_buffer to make it modifyable.
- tag_ns_buffer = malloc(tag_ns_len);
- strcpy(tag_ns_buffer, tag_ns_name_ptr);
- // lowercase tag buffer (can be removed, just a nice to have)
- tag_ns_buffer = lowercase(tag_ns_buffer);
- // prepend namespace to tag name, e.g. "svg:path"
- stpcpy(stpcpy(stpcpy(tag_string, tag_ns_buffer), ":"), tag_name);
- tag_string_len = tag_ns_len + tag_name_len + 1; // +1 for colon
- }
- else
- {
- stpcpy(tag_string, tag_name);
- tag_string_len = tag_name_len;
- }
-
- // put unknown and non-html tags it in a binary
- if (!(*parse_flags & FLAG_HTML_ATOMS) || (tag_id == MyHTML_TAG__UNDEF || tag_id == MyHTML_TAG_LAST_ENTRY || tag_ns != MyHTML_NAMESPACE_HTML))
- {
- ErlNifBinary tag_b;
- enif_alloc_binary(tag_string_len, &tag_b);
- memcpy(tag_b.data, tag_string, tag_string_len);
- tag = enif_make_binary(env, &tag_b);
- }
- else
- {
- tag = make_atom(env, tag_string);
- }
-
- // attributes
- attrs = build_node_attrs(env, tree, node);
-
- // add children or nil as a self-closing flag
- children = build_node_children(env, tree, node, parse_flags);
-
- // free allocated resources
- if (tag_ns != MyHTML_NAMESPACE_HTML)
- {
- free(tag_ns_buffer);
- }
-
- result = enif_make_tuple3(env,
- tag,
- attrs,
- children
- );
- }
-
- return result;
-}
-
-void
-nif_cleanup_myhtmlex_ref(ErlNifEnv* env, void* obj)
-{
- myhtmlex_ref_t* ref = (myhtmlex_ref_t*) obj;
- // release myhtml resources
- myhtml_tree_destroy(ref->tree);
-}
-
-// Erlang NIF
-
-static int
-load(ErlNifEnv *env, void **priv, ERL_NIF_TERM info)
-{
- myhtmlex_state_t* state = enif_alloc(sizeof(myhtmlex_state_t));
- if (state == NULL)
- {
- return 1;
- }
-
- state->myhtml_tree_rt = enif_open_resource_type(
- env,
- NULL,
- "myhtmlex_ref_t",
- &nif_cleanup_myhtmlex_ref,
- ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER,
- NULL
- );
- ATOM_NIL = make_atom(env, "nil");
- ATOM_COMMENT = make_atom(env, "comment");
- ATOM_HTML_ATOMS = make_atom(env, "html_atoms");
- ATOM_NIL_SELF_CLOSING = make_atom(env, "nil_self_closing");
- ATOM_COMMENT_TUPLE3 = make_atom(env, "comment_tuple3");
- EMPTY_LIST = enif_make_list(env, 0);
-
- // myhtml basic init
- state->myhtml = myhtml_create();
- myhtml_init(state->myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
- state->tree = myhtml_tree_create();
- myhtml_tree_init(state->tree, state->myhtml);
-
- *priv = (void*) state;
- return 0;
-}
-
-static int
-reload(ErlNifEnv *env, void **priv, ERL_NIF_TERM info)
-{
- return 0;
-}
-
-static int
-upgrade(ErlNifEnv *env, void **priv, void **old_priv, ERL_NIF_TERM info)
-{
- return load(env, priv, info);
-}
-
-static void
-unload(ErlNifEnv *env, void *priv)
-{
- myhtmlex_state_t* state = (myhtmlex_state_t*) priv;
-
- myhtml_tree_destroy(state->tree);
- myhtml_destroy(state->myhtml);
- enif_free(priv);
- return;
-}
-
-static ErlNifFunc funcs[] =
-{
- {"decode", 1, nif_decode, ERL_NIF_DIRTY_JOB_CPU_BOUND},
- {"decode", 2, nif_decode, ERL_NIF_DIRTY_JOB_CPU_BOUND},
- {"open", 1, nif_open, ERL_NIF_DIRTY_JOB_CPU_BOUND},
- {"decode_tree", 1, nif_decode_tree, ERL_NIF_DIRTY_JOB_CPU_BOUND},
- {"decode_tree", 2, nif_decode_tree, ERL_NIF_DIRTY_JOB_CPU_BOUND}
-};
-
-ERL_NIF_INIT(Elixir.Myhtmlex.Nif, funcs, &load, &reload, &upgrade, &unload)
-
diff --git a/c_src/myhtmlex.h b/c_src/myhtmlex.h
deleted file mode 100644
index b11c2cd..0000000
--- a/c_src/myhtmlex.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef MYHTMLEX_H
-#define MYHTMLEX_H
-
-#include <stdlib.h>
-#include <ctype.h>
-#include <string.h>
-#include "erl_nif.h"
-#include <myhtml/myhtml.h>
-#include <myhtml/mynamespace.h>
-
-char*
-lowercase(char* c);
-// myhtmlex.c
-ERL_NIF_TERM
-make_atom(ErlNifEnv* env, const char* name);
-ERL_NIF_TERM
-nif_decode(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
-ERL_NIF_TERM
-nif_decode_tree(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
-ERL_NIF_TERM
-nif_open(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]);
-ERL_NIF_TERM
-build_node_attrs(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node);
-ERL_NIF_TERM
-build_tree(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsigned char* flags);
-ERL_NIF_TERM
-build_node_children(ErlNifEnv* env, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsigned char* flags);
-void
-nif_cleanup_myhtml_tree(ErlNifEnv* env, void* obj);
-unsigned char
-read_parse_flags(ErlNifEnv* env, const ERL_NIF_TERM* options);
-
-// consts
-ERL_NIF_TERM ATOM_NIL;
-ERL_NIF_TERM ATOM_COMMENT;
-ERL_NIF_TERM ATOM_HTML_ATOMS;
-ERL_NIF_TERM ATOM_NIL_SELF_CLOSING;
-ERL_NIF_TERM ATOM_COMMENT_TUPLE3;
-ERL_NIF_TERM EMPTY_LIST;
-const unsigned char FLAG_HTML_ATOMS = 1 << 0;
-const unsigned char FLAG_NIL_SELF_CLOSING = 1 << 1;
-const unsigned char FLAG_COMMENT_TUPLE3 = 1 << 2;
-
-typedef struct {
- myhtml_t* myhtml;
- myhtml_tree_t* tree;
-
- ErlNifResourceType* myhtml_tree_rt;
-} myhtmlex_state_t;
-
-typedef struct {
- myhtml_tree_t* tree;
- myhtml_tree_node_t *root;
-} myhtmlex_ref_t;
-
-#endif // included myhtmlex.h
diff --git a/lib/myhtmlex.ex b/lib/myhtmlex.ex
index d612716..a6b0637 100644
--- a/lib/myhtmlex.ex
+++ b/lib/myhtmlex.ex
@@ -1,176 +1,105 @@
defmodule Myhtmlex do
@moduledoc """
A module to decode html into a tree structure.
Based on [Alexander Borisov's myhtml](https://github.com/lexborisov/myhtml),
this binding gains the properties of being html-spec compliant and very fast.
## Example
iex> Myhtmlex.decode("<h1>Hello world</h1>")
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}
- Benchmark results (Nif calling mode) on various file sizes on a 2,5Ghz Core i7:
+ Benchmark results (removed Nif calling mode) on various file sizes on a 2,5Ghz Core i7:
Settings:
duration: 1.0 s
## FileSizesBench
[15:28:42] 1/3: github_trending_js.html 341k
[15:28:46] 2/3: w3c_html5.html 131k
[15:28:48] 3/3: wikipedia_hyperlink.html 97k
Finished in 7.52 seconds
## FileSizesBench
benchmark name iterations average time
wikipedia_hyperlink.html 97k 1000 1385.86 µs/op
w3c_html5.html 131k 1000 2179.30 µs/op
github_trending_js.html 341k 500 5686.21 µs/op
-
- ## Configuration
-
- The module you are calling into is always `Myhtmlex` and depending on your application configuration,
- it chooses between the underlying implementations `Myhtmlex.Safe` (default) and `Myhtmlex.Nif`.
-
- Erlang interoperability is a tricky mine-field.
- You can call into C directly using native implemented functions (Nif). But this comes with the risk,
- that if anything goes wrong within the C implementation, your whole VM will crash.
- No more supervisor cushions for here on, just violent crashes.
-
- That is why the default mode of operation keeps your VM safe and happy.
- If you need ultimate parsing speed, or you can simply tolerate VM-level crashes, read on.
-
- ### Call into C-Node (default)
-
- This is the default mode of operation.
- If your application cannot tolerate VM-level crashes, this option allows you to gain the best of both worlds.
- The added overhead is client/server communications, and a worker OS-process that runs next to your VM under VM supervision.
-
- You do not have to do anything to start the worker process, everything is taken care of within the library.
- If you are not running in distributed mode, your VM will automatically be assigned a `sname`.
-
- The worker OS-process stays alive as long as it is under VM-supervision. If your VM goes down, the OS-process will die by itself.
- If the worker OS-process dies for some reason, your VM stays unaffected and will attempt to restart it seamlessly.
-
- ### Call into Nif
-
- If your application is aiming for ultimate parsing speed, and in the worst case can tolerate VM-level crashes, you can call directly into the Nif.
-
- 1. Require myhtmlex without runtime
-
- in your `mix.exs`
-
- def deps do
- [
- {:myhtmlex, ">= 0.0.0", runtime: false}
- ]
- end
-
- 2. Configure the mode to `Myhtmlex.Nif`
-
- e.g. in `config/config.exs`
-
- config :myhtmlex, mode: Myhtmlex.Nif
-
- 3. Bonus: You can [open up in-memory references to parsed trees](https://hexdocs.pm/myhtmlex/Myhtmlex.html#open/1), without parsing + mapping erlang terms in one go
"""
@type tag() :: String.t() | atom()
@type attr() :: {String.t(), String.t()}
@type attr_list() :: [] | [attr()]
@type comment_node() :: {:comment, String.t()}
@type comment_node3() :: {:comment, [], String.t()}
@type tree() ::
{tag(), attr_list(), tree()}
| {tag(), attr_list(), nil}
| comment_node()
| comment_node3()
@type format_flag() :: :html_atoms | :nil_self_closing | :comment_tuple3
defp module() do
- Application.get_env(:myhtmlex, :mode, Myhtmlex.Nif)
+ Application.get_env(:myhtmlex, :mode, Myhtmlex.Safe)
end
@doc """
Returns a tree representation from the given html string.
## Examples
iex> Myhtmlex.decode("<h1>Hello world</h1>")
{"html", [], [{"head", [], []}, {"body", [], [{"h1", [], ["Hello world"]}]}]}
iex> Myhtmlex.decode("<span class='hello'>Hi there</span>")
{"html", [],
[{"head", [], []},
{"body", [], [{"span", [{"class", "hello"}], ["Hi there"]}]}]}
iex> Myhtmlex.decode("<body><!-- a comment --!></body>")
{"html", [], [{"head", [], []}, {"body", [], [comment: " a comment "]}]}
iex> Myhtmlex.decode("<br>")
{"html", [], [{"head", [], []}, {"body", [], [{"br", [], []}]}]}
"""
@spec decode(String.t()) :: tree()
def decode(bin) do
decode(bin, format: [])
end
@doc """
Returns a tree representation from the given html string.
This variant allows you to pass in one or more of the following format flags:
* `:html_atoms` uses atoms for known html tags (faster), binaries for everything else.
* `:nil_self_closing` uses `nil` to designate self-closing tags and void elements.
For example `<br>` is then being represented like `{"br", [], nil}`.
See http://w3c.github.io/html-reference/syntax.html#void-elements for a full list of void elements.
* `:comment_tuple3` uses 3-tuple elements for comments, instead of the default 2-tuple element.
## Examples
iex> Myhtmlex.decode("<h1>Hello world</h1>", format: [:html_atoms])
{:html, [], [{:head, [], []}, {:body, [], [{:h1, [], ["Hello world"]}]}]}
iex> Myhtmlex.decode("<br>", format: [:nil_self_closing])
{"html", [], [{"head", [], []}, {"body", [], [{"br", [], nil}]}]}
iex> Myhtmlex.decode("<body><!-- a comment --!></body>", format: [:comment_tuple3])
{"html", [], [{"head", [], []}, {"body", [], [{:comment, [], " a comment "}]}]}
iex> html = "<body><!-- a comment --!><unknown /></body>"
iex> Myhtmlex.decode(html, format: [:html_atoms, :nil_self_closing, :comment_tuple3])
{:html, [],
[{:head, [], []},
{:body, [], [{:comment, [], " a comment "}, {"unknown", [], nil}]}]}
"""
@spec decode(String.t(), format: [format_flag()]) :: tree()
def decode(bin, format: flags) do
module().decode(bin, flags)
end
-
- @doc """
- Returns a reference to an internally parsed myhtml_tree_t. (Nif only!)
- """
- @spec open(String.t()) :: reference()
- def open(bin) do
- Myhtmlex.Nif.open(bin)
- end
-
- @doc """
- Returns a tree representation from the given reference. See `decode/1` for example output. (Nif only!)
- """
- @spec decode_tree(reference()) :: tree()
- def decode_tree(ref) do
- Myhtmlex.Nif.decode_tree(ref)
- end
-
- @doc """
- Returns a tree representation from the given reference. See `decode/2` for options and example output. (Nif only!)
- """
- @spec decode_tree(reference(), format: [format_flag()]) :: tree()
- def decode_tree(ref, format: flags) do
- Myhtmlex.Nif.decode_tree(ref, flags)
- end
end
diff --git a/lib/myhtmlex/nif.ex b/lib/myhtmlex/nif.ex
deleted file mode 100644
index f698026..0000000
--- a/lib/myhtmlex/nif.ex
+++ /dev/null
@@ -1,26 +0,0 @@
-defmodule Myhtmlex.Nif do
- @moduledoc false
- @on_load {:init, 0}
-
- app = Mix.Project.config()[:app]
-
- def init do
- path = :filename.join(:code.priv_dir(unquote(app)), 'myhtmlex')
- :ok = :erlang.load_nif(path, 0)
- end
-
- def decode(bin)
- def decode(_), do: exit(:nif_library_not_loaded)
-
- def decode(bin, flags)
- def decode(_, _), do: exit(:nif_library_not_loaded)
-
- def open(bin)
- def open(_), do: exit(:nif_library_not_loaded)
-
- def decode_tree(tree)
- def decode_tree(_), do: exit(:nif_library_not_loaded)
-
- def decode_tree(tree, flags)
- def decode_tree(_, _), do: exit(:nif_library_not_loaded)
-end
diff --git a/mix.exs b/mix.exs
index 56ccdbb..311b65c 100644
--- a/mix.exs
+++ b/mix.exs
@@ -1,124 +1,123 @@
defmodule Myhtmlex.Mixfile do
use Mix.Project
def project do
[
app: :myhtmlex,
version: "0.2.1",
elixir: "~> 1.5",
deps: deps(),
package: package(),
compilers: [:myhtmlex_make] ++ Mix.compilers(),
build_embedded: Mix.env() == :prod,
start_permanent: Mix.env() == :prod,
name: "Myhtmlex",
description: """
A module to decode HTML into a tree,
porting all properties of the underlying
library myhtml, being fast and correct
in regards to the html spec.
""",
docs: docs()
]
end
def package do
[
maintainers: ["Lukas Rieder"],
licenses: ["GNU LGPL"],
links: %{
"Github" => "https://git.pleroma.social/pleroma/myhtmlex",
"Issues" => "https://git.pleroma.social/pleroma/myhtmlex/issues",
"MyHTML" => "https://github.com/lexborisov/myhtml"
},
files: [
"lib",
"c_src",
"priv/.gitignore",
"test",
"Makefile",
"mix.exs",
"README.md",
"LICENSE"
]
]
end
def application do
[
extra_applications: [:logger],
mod: {Myhtmlex.Safe, []},
# used to detect conflicts with other applications named processes
registered: [Myhtmlex.Safe.Cnode, Myhtmlex.Safe.Supervisor],
env: [
mode: Myhtmlex.Safe
]
]
end
defp deps do
[
# documentation helpers
{:ex_doc, ">= 0.0.0", only: :dev},
# benchmarking helpers
{:benchfella, "~> 0.3.0", only: :dev},
# cnode helpers
{:nodex,
git: "https://git.pleroma.social/pleroma/nodex",
ref: "cb6730f943cfc6aad674c92161be23a8411f15d1"}
]
end
defp docs do
[
main: "Myhtmlex"
]
end
end
defmodule Mix.Tasks.Compile.MyhtmlexMake do
@artifacts [
- "priv/myhtmlex.so",
"priv/myhtml_worker"
]
def find_make do
_make_cmd =
System.get_env("MAKE") ||
case :os.type() do
{:unix, :freebsd} -> "gmake"
{:unix, :openbsd} -> "gmake"
{:unix, :netbsd} -> "gmake"
{:unix, :dragonfly} -> "gmake"
_ -> "make"
end
end
# Mix compiler entry point: builds the native artifacts listed in @artifacts
# by invoking make (see find_make/0) and echoes make's combined
# stdout/stderr output.
#
# Returns :ok when make succeeds. Raises a Mix error when make exits with a
# non-zero status, so a failed native build is not reported as a successful
# compile. On Windows it warns and exits, as Windows is not a supported target.
def run(_) do
  make_cmd = find_make()

  if match?({:win32, _}, :os.type()) do
    IO.warn("Windows is not yet a target.")
    exit(1)
  else
    {result, exit_code} =
      System.cmd(
        make_cmd,
        @artifacts,
        stderr_to_stdout: true,
        env: [{"MIX_ENV", to_string(Mix.env())}]
      )

    # always show the build log, it carries the compiler diagnostics
    IO.binwrite(result)

    # do not swallow a failed build: without the artifacts the application
    # cannot start its worker at runtime
    if exit_code != 0 do
      Mix.raise(
        "#{make_cmd} exited with status #{exit_code} while building " <>
          Enum.join(@artifacts, ", ")
      )
    end
  end

  :ok
end
def clean() do
make_cmd = find_make()
{result, _error_code} = System.cmd(make_cmd, ["clean"], stderr_to_stdout: true)
Mix.shell().info(result)
:ok
end
end
diff --git a/test/myhtmlex.nif_test.exs b/test/myhtmlex.nif_test.exs
deleted file mode 100644
index c93126d..0000000
--- a/test/myhtmlex.nif_test.exs
+++ /dev/null
@@ -1,29 +0,0 @@
-defmodule Myhtmlex.NifTest do
- use MyhtmlexSharedTests, module: Myhtmlex.Nif
-
- test "parse a larger file (131K)" do
- html = File.read!("bench/github_trending_js.html")
- ref = Myhtmlex.open(html)
- assert is_reference(ref)
- assert is_tuple(Myhtmlex.decode_tree(ref))
- end
-
- test "open" do
- ref = Myhtmlex.open(~s'<dif class="a"></div><div class="b"></div>')
- assert is_reference(ref)
- end
-
- test "open and decode_tree" do
- ref = Myhtmlex.open(~s'text node')
- assert is_reference(ref)
-
- assert {:html, [],
- [
- {:head, [], []},
- {:body, [],
- [
- "text node"
- ]}
- ]} = Myhtmlex.decode_tree(ref, format: [:html_atoms])
- end
-end

File Metadata

Mime Type
text/x-diff
Expires
Thu, Nov 28, 12:27 PM (1 d, 18 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
41036
Default Alt Text
(31 KB)

Event Timeline