Page Menu
Home
Phorge
Search
Configure Global Search
Log In
Files
F115892
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Award Token
Flag For Later
Size
18 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/bench/cnode/file_sizes_bench.exs b/bench/cnode/file_sizes_bench.exs
index e7501cf..6c4722f 100644
--- a/bench/cnode/file_sizes_bench.exs
+++ b/bench/cnode/file_sizes_bench.exs
@@ -1,31 +1,31 @@
defmodule CnodeFileSizesBench do
use Benchfella
setup_all do
Nodex.Distributed.up
- {:ok, _pid} = Nodex.Cnode.start_link(%{exec_path: "priv/myhtml_worker"}, name: Myhtmlex.Safe)
+ {:ok, _pid} = Nodex.Cnode.start_link(%{exec_path: "priv/myhtml_worker"}, name: Myhtmlex.Safe.Cnode)
contents = {
File.read!("bench/github_trending_js.html"),
File.read!("bench/w3c_html5.html"),
File.read!("bench/wikipedia_hyperlink.html")
}
{:ok, contents}
end
bench "github_trending_js.html 341k" do
{ref, _, _} = bench_context
Myhtmlex.Safe.decode(ref)
end
bench "w3c_html5.html 131k" do
{_, ref, _} = bench_context
Myhtmlex.Safe.decode(ref)
end
bench "wikipedia_hyperlink.html 97k" do
{_, _, ref} = bench_context
Myhtmlex.Safe.decode(ref)
end
end
diff --git a/c_src/myhtml_worker.c b/c_src/myhtml_worker.c
index ff447fc..f78073e 100644
--- a/c_src/myhtml_worker.c
+++ b/c_src/myhtml_worker.c
@@ -1,449 +1,426 @@
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <ctype.h>
#include "erl_interface.h"
#include "ei.h"
+#include "tstack.h"
+
#include <myhtml/myhtml.h>
#include <myhtml/mynamespace.h>
#define BUFFER_SIZE 1000
typedef struct _state_t {
int fd;
myhtml_tree_t* tree;
} state_t;
typedef struct _prefab_t {
ETERM* atom_nil;
ETERM* atom_comment;
ETERM* empty_list;
} prefab_t;
void
handle_emsg(state_t* state, ErlMessage* emsg);
void
handle_send(state_t* state, ErlMessage* emsg);
ETERM*
decode(state_t* state, ErlMessage* emsg, ETERM* bin, ETERM* args);
ETERM*
build_tree(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsigned char* parse_flags);
ETERM*
-build_node_children(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* parent, unsigned char* parse_flags);
-ETERM*
build_node_attrs(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node);
ETERM*
err_term(const char* error_atom);
unsigned char
read_parse_flags(ETERM* list);
char*
lowercase(char* c);
const unsigned char FLAG_HTML_ATOMS = 1 << 0;
const unsigned char FLAG_NIL_SELF_CLOSING = 1 << 1;
const unsigned char FLAG_COMMENT_TUPLE3 = 1 << 2;
int main(int argc, char **argv) {
if (argc != 5 || !strcmp(argv[1],"-h") || !strcmp(argv[1],"--help")) {
printf("\nUsage: ./priv/cnode_server <sname> <hostname> <cookie> <tname>\n\n");
printf(" sname the short name you want this c-node to connect as\n");
printf(" hostname the hostname\n");
printf(" cookie the authentication cookie\n");
printf(" tname the target node short name to connect to");
return 0;
}
char *sname = argv[1];
char *hostname = argv[2];
char *cookie = argv[3];
char *tname = argv[4];
char full_name[1024];
- stpcpy(stpcpy(stpcpy(full_name, sname), "@"), hostname);
+ stpncpy(stpncpy(stpncpy(full_name, sname, sizeof(full_name)), "@", sizeof(full_name)), hostname, sizeof(full_name));
char target_node[1024];
- stpcpy(stpcpy(stpcpy(target_node, tname), "@"), hostname);
+ stpncpy(stpncpy(stpncpy(target_node, tname, sizeof(target_node)), "@", sizeof(target_node)), hostname, sizeof(target_node));
struct in_addr addr;
addr.s_addr = htonl(INADDR_ANY);
// fd to erlang node
state_t* state = (state_t*)malloc(sizeof(state_t));
bool looping = true;
int buffer_size = BUFFER_SIZE;
unsigned char* bufferpp = (unsigned char*)malloc(BUFFER_SIZE);
ErlMessage emsg;
// initialize all of Erl_Interface
erl_init(NULL, 0);
// initialize this node
printf("initialising %s\n", full_name); fflush(stdout);
if ( erl_connect_xinit(hostname, sname, full_name, &addr, cookie, 0) == -1 )
erl_err_quit("error erl_connect_init");
// connect to target node
printf("connecting to %s\n", target_node); fflush(stdout);
if ((state->fd = erl_connect(target_node)) < 0)
erl_err_quit("erl_connect");
myhtml_t* myhtml = myhtml_create();
myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
state->tree = myhtml_tree_create();
myhtml_tree_init(state->tree, myhtml);
// signal to stdout that we are ready
printf("%s ready\n", full_name); fflush(stdout);
while (looping)
{
// erl_xreceive_msg adapts the buffer width
switch( erl_xreceive_msg(state->fd, &bufferpp, &buffer_size, &emsg) )
// erl_receive_msg, uses a fixed buffer width
/* switch( erl_receive_msg(state->fd, buffer, BUFFER_SIZE, &emsg) ) */
{
case ERL_TICK:
// ignore
break;
case ERL_ERROR:
// On failure, the function returns ERL_ERROR and sets erl_errno to one of:
//
// EMSGSIZE
// Buffer is too small.
// ENOMEM
// No more memory is available.
// EIO
// I/O error.
//
// TODO: what is the correct reaction?
looping = false;
break;
default:
handle_emsg(state, &emsg);
}
}
}
void
handle_emsg(state_t* state, ErlMessage* emsg)
{
switch(emsg->type)
{
case ERL_REG_SEND:
case ERL_SEND:
handle_send(state, emsg);
break;
case ERL_LINK:
case ERL_UNLINK:
break;
case ERL_EXIT:
break;
}
// its our responsibility to free these pointers
erl_free_compound(emsg->msg);
erl_free_compound(emsg->to);
erl_free_compound(emsg->from);
}
void
handle_send(state_t* state, ErlMessage* emsg)
{
ETERM *decode_pattern = erl_format("{decode, Bin, Args}");
ETERM *response;
if (erl_match(decode_pattern, emsg->msg))
{
ETERM *bin = erl_var_content(decode_pattern, "Bin");
ETERM *args = erl_var_content(decode_pattern, "Args");
response = decode(state, emsg, bin, args);
// free allocated resources
erl_free_term(bin);
erl_free_term(args);
}
else
{
response = err_term("unknown_call");
return;
}
// send response
erl_send(state->fd, emsg->from, response);
// free allocated resources
erl_free_compound(response);
erl_free_term(decode_pattern);
// free the free-list
erl_eterm_release();
return;
}
ETERM*
err_term(const char* error_atom)
{
/* ETERM* tuple2[] = {erl_mk_atom("error"), erl_mk_atom(error_atom)}; */
/* return erl_mk_tuple(tuple2, 2); */
return erl_format("{error, ~w}", erl_mk_atom(error_atom));
}
ETERM*
decode(state_t* state, ErlMessage* emsg, ETERM* bin, ETERM* args)
{
unsigned char parse_flags = 0;
prefab_t prefab;
// prepare reusable prefab terms
prefab.atom_nil = erl_mk_atom("nil");
prefab.atom_comment = erl_mk_atom("comment");
prefab.empty_list = erl_mk_empty_list();
+
if (!ERL_IS_BINARY(bin) || !ERL_IS_LIST(args))
{
return err_term("badarg");
}
// get contents of binary argument
char* binary = (char*)ERL_BIN_PTR(bin);
size_t binary_len = ERL_BIN_SIZE(bin);
// parse tree
mystatus_t status = myhtml_parse(state->tree, MyENCODING_UTF_8, binary, binary_len);
if (status != MyHTML_STATUS_OK)
{
return err_term("myhtml_parse_failed");
}
// read parse flags
parse_flags = read_parse_flags(args);
// build tree
myhtml_tree_node_t *root = myhtml_tree_get_document(state->tree);
return build_tree(&prefab, state->tree, myhtml_node_last_child(root), &parse_flags);
}
unsigned char
read_parse_flags(ETERM* list)
{
unsigned char parse_flags = 0;
ETERM *flag;
for (; !ERL_IS_EMPTY_LIST(list); list = ERL_CONS_TAIL(list)) {
flag = ERL_CONS_HEAD(list);
if (erl_match(erl_format("html_atoms"), flag))
{
parse_flags |= FLAG_HTML_ATOMS;
}
else if (erl_match(erl_format("nil_self_closing"), flag))
{
parse_flags |= FLAG_NIL_SELF_CLOSING;
}
else if (erl_match(erl_format("comment_tuple3"), flag))
{
parse_flags |= FLAG_COMMENT_TUPLE3;
}
}
return parse_flags;
}
-
-ETERM*
-build_tree(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsigned char* parse_flags)
+ETERM* build_tree(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node, unsigned char* parse_flags)
{
ETERM* result;
- myhtml_tag_id_t tag_id = myhtml_node_tag_id(node);
- myhtml_namespace_t tag_ns = myhtml_node_namespace(node);
+ myhtml_tree_node_t* prev_node = NULL;
- if (tag_id == MyHTML_TAG__TEXT)
- {
- size_t text_len;
- const char* node_text = myhtml_node_text(node, &text_len);
- result = erl_mk_binary(node_text, text_len);
- }
- else if (tag_id == MyHTML_TAG__COMMENT)
- {
- size_t comment_len;
- const char* node_comment = myhtml_node_text(node, &comment_len);
- ETERM* comment = erl_mk_binary(node_comment, comment_len);
-
- if (*parse_flags & FLAG_COMMENT_TUPLE3)
- {
- /* ETERM* tuple3[] = {prefab->atom_comment, prefab->empty_list, comment}; */
- /* result = erl_mk_tuple(tuple3, 3); */
- result = erl_format("{comment, [], ~w}", comment);
- }
- else
- {
- /* ETERM* tuple2[] = {prefab->atom_comment, comment}; */
- /* result = erl_mk_tuple(tuple2, 2); */
- result = erl_format("{comment, ~w}", comment);
- }
- }
- else
- {
- ETERM* tag;
- ETERM* attrs;
+ tstack stack;
+ tstack_init(&stack, 30);
+ for(myhtml_tree_node_t* current_node = node;;) {
ETERM* children;
- // get name of tag
- size_t tag_name_len;
- const char *tag_name = myhtml_tag_name_by_id(tree, tag_id, &tag_name_len);
- // get namespace of tag
- size_t tag_ns_len;
- const char *tag_ns_name_ptr = myhtml_namespace_name_by_id(tag_ns, &tag_ns_len);
- char *tag_ns_buffer;
- char buffer [tag_ns_len + tag_name_len + 1];
- char *tag_string = buffer;
- size_t tag_string_len;
-
- if (tag_ns != MyHTML_NAMESPACE_HTML)
- {
- // tag_ns_name_ptr is unmodifyable, copy it in our tag_ns_buffer to make it modifyable.
- tag_ns_buffer = malloc(tag_ns_len);
- strcpy(tag_ns_buffer, tag_ns_name_ptr);
- // lowercase tag buffer (can be removed, just a nice to have)
- tag_ns_buffer = lowercase(tag_ns_buffer);
- // prepend namespace to tag name, e.g. "svg:path"
- stpcpy(stpcpy(stpcpy(tag_string, tag_ns_buffer), ":"), tag_name);
- tag_string_len = tag_ns_len + tag_name_len + 1; // +1 for colon
- }
- else
- {
- stpcpy(tag_string, tag_name);
- tag_string_len = tag_name_len;
+ // If we are going up the tree, get the children from the stack
+ if (prev_node && !(current_node->next == prev_node || current_node->parent == prev_node)) {
+ children = tstack_pop(&stack);
+ // Else, try to go down the tree
+ } else if(current_node->last_child) {
+ tstack_push(&stack, erl_mk_empty_list());
+
+ prev_node = current_node;
+ current_node=current_node->last_child;
+
+ continue;
+ } else {
+ if ((myhtml_node_is_close_self(current_node) || myhtml_node_is_void_element(current_node))
+ && (*parse_flags & FLAG_NIL_SELF_CLOSING)) {
+ children = prefab->atom_nil;
+ } else {
+ children = prefab->empty_list;
+ }
}
- // attributes
- attrs = build_node_attrs(prefab, tree, node);
+ myhtml_tag_id_t tag_id = myhtml_node_tag_id(current_node);
+ myhtml_namespace_t tag_ns = myhtml_node_namespace(current_node);
- // children
- children = build_node_children(prefab, tree, node, parse_flags);
+ if (tag_id == MyHTML_TAG__TEXT)
+ {
+ size_t text_len;
- if (!(*parse_flags & FLAG_HTML_ATOMS) || (tag_id == MyHTML_TAG__UNDEF || tag_id == MyHTML_TAG_LAST_ENTRY || tag_ns != MyHTML_NAMESPACE_HTML))
+ const char* node_text = myhtml_node_text(current_node, &text_len);
+ result = erl_mk_binary(node_text, text_len);
+ }
+ else if (tag_id == MyHTML_TAG__COMMENT)
{
- tag = erl_mk_binary(tag_string, tag_string_len);
- /* ETERM* tuple3[] = {tag, attrs, children}; */
- /* result = erl_mk_tuple(tuple3, 3); */
- result = erl_format("{~w, ~w, ~w}", tag, attrs, children);
+ size_t comment_len;
+ const char* node_comment = myhtml_node_text(current_node, &comment_len);
+
+ // For <!----> myhtml_node_text will return a null pointer, which will make erl_format segfault
+ ETERM* comment = erl_mk_binary(node_comment ? node_comment : "", comment_len);
+
+ if (*parse_flags & FLAG_COMMENT_TUPLE3)
+ {
+ result = erl_format("{comment, [], ~w}", comment);
+ }
+ else
+ {
+ result = erl_format("{comment, ~w}", comment);
+ }
}
else
{
- // tag = erl_mk_atom(tag_string);
- tag = erl_mk_atom(tag_string);
- /* ETERM* tuple3[] = {tag, attrs, children}; */
- /* result = erl_mk_tuple(tuple3, 3); */
- result = erl_format("{~w, ~w, ~w}", tag, attrs, children);
- }
+ ETERM* tag;
+ ETERM* attrs;
+
+ // get name of tag
+ size_t tag_name_len;
+ const char *tag_name = myhtml_tag_name_by_id(tree, tag_id, &tag_name_len);
+ // get namespace of tag
+ size_t tag_ns_len;
+ const char *tag_ns_name_ptr = myhtml_namespace_name_by_id(tag_ns, &tag_ns_len);
+ char buffer [tag_ns_len + tag_name_len + 2];
+ char *tag_string = buffer;
+ size_t tag_string_len;
+
+ if (tag_ns != MyHTML_NAMESPACE_HTML)
+ {
+ // tag_ns_name_ptr is unmodifyable, copy it in our tag_ns_buffer to make it modifyable.
+ // +1 because myhtml uses strlen for length returned, which doesn't include the null-byte
+ // https://github.com/lexborisov/myhtml/blob/0ade0e564a87f46fd21693a7d8c8d1fa09ffb6b6/source/myhtml/mynamespace.c#L80
+ char tag_ns_buffer[tag_ns_len + 1];
+ strncpy(tag_ns_buffer, tag_ns_name_ptr, sizeof(tag_ns_buffer));
+ lowercase(tag_ns_buffer);
+
+ tag_string_len = tag_ns_len + tag_name_len + 1; // +1 for colon
+ snprintf(tag_string, sizeof(buffer), "%s:%s", tag_ns_buffer, tag_name);
+ }
+ else
+ {
+ strncpy(tag_string, tag_name, sizeof(buffer));
+ tag_string_len = tag_name_len;
+ }
+
+ // attributes
+ attrs = build_node_attrs(prefab, tree, current_node);
+
+
+ if (!(*parse_flags & FLAG_HTML_ATOMS) || (tag_id == MyHTML_TAG__UNDEF || tag_id == MyHTML_TAG_LAST_ENTRY || tag_ns != MyHTML_NAMESPACE_HTML))
+ tag = erl_mk_binary(tag_string, tag_string_len);
+ else
+ tag = erl_mk_atom(tag_string);
- // free allocated resources
- if (tag_ns != MyHTML_NAMESPACE_HTML)
- {
- free(tag_ns_buffer);
+ result = erl_format("{~w, ~w, ~w}", tag, attrs, children);
}
- }
-
- return result;
-}
-
-ETERM*
-build_node_children(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* parent, unsigned char* parse_flags)
-{
- if (myhtml_node_is_close_self(parent) && (*parse_flags & FLAG_NIL_SELF_CLOSING))
- {
- /* prefab->atom_nil; */
- return erl_mk_atom("nil");
- }
-
- myhtml_tree_node_t* child = myhtml_node_last_child(parent);
- if (child == NULL)
- {
- if (myhtml_node_is_void_element(parent) && (*parse_flags & FLAG_NIL_SELF_CLOSING))
- {
- /* return prefab->atom_nil; */
- return erl_mk_atom("nil");
+ if (stack.used == 0) {
+ tstack_free(&stack);
+ return result;
+ } else {
+ tstack_push(&stack, erl_cons(result, tstack_pop(&stack)));
+ prev_node = current_node;
+ current_node=current_node->prev ? current_node->prev : current_node->parent;
}
- /* else */
- /* { */
- /* return prefab->empty_list; */
- /* } */
}
-
- ETERM* list = erl_mk_empty_list();
-
- while (child)
- {
- ETERM* node_tuple = build_tree(prefab, tree, child, parse_flags);
- list = erl_cons(node_tuple, list);
-
- // get previous child, building the list from reverse
- child = myhtml_node_prev(child);
- }
-
- return list;
}
ETERM*
build_node_attrs(prefab_t* prefab, myhtml_tree_t* tree, myhtml_tree_node_t* node)
{
myhtml_tree_attr_t* attr = myhtml_node_attribute_last(node);
/* if (attr == NULL) */
/* { */
/* return prefab->empty_list; */
/* } */
ETERM* list = erl_mk_empty_list();
while (attr)
{
ETERM* name;
ETERM* value;
ETERM* attr_tuple;
size_t attr_name_len;
const char *attr_name = myhtml_attribute_key(attr, &attr_name_len);
size_t attr_value_len;
const char *attr_value = myhtml_attribute_value(attr, &attr_value_len);
if (attr_value) {
value = erl_mk_binary(attr_value, attr_value_len);
} else {
value = erl_mk_binary(attr_name, attr_name_len);
}
name = erl_mk_binary(attr_name, attr_name_len);
/* ETERM* tuple2[] = {name, value}; */
/* attr_tuple = erl_mk_tuple(tuple2, 2); */
attr_tuple = erl_format("{~w, ~w}", name, value);
list = erl_cons(attr_tuple, list);
// get prev attribute, building the list from reverse
attr = myhtml_attribute_prev(attr);
}
return list;
}
char*
lowercase(char* c)
{
char* p = c;
while(*p)
{
*p = tolower((unsigned char)*p);
p++;
}
return c;
}
diff --git a/c_src/tstack.h b/c_src/tstack.h
new file mode 100644
index 0000000..48141bc
--- /dev/null
+++ b/c_src/tstack.h
@@ -0,0 +1,39 @@
+#ifndef TSTACK_H
+#define TSTACK_H
+
+#include "ei.h"
+#define GROW_BY 30
+
+typedef struct {
+ ETERM* *data;
+ size_t used;
+ size_t size;
+} tstack;
+
+void tstack_init(tstack *stack, size_t initial_size) {
+ stack->data = (ETERM **) malloc(initial_size * sizeof(ETERM*));
+ stack->used = 0;
+ stack->size = initial_size;
+}
+
+void tstack_free(tstack *stack) {
+ free(stack->data);
+}
+
+void tstack_resize(tstack *stack, size_t new_size) {
+ stack->data = (ETERM **)realloc(stack->data, new_size * sizeof(ETERM*));
+ stack->size = new_size;
+}
+
+void tstack_push(tstack *stack, ETERM* element) {
+ if(stack->used == stack->size) {
+ tstack_resize(stack, stack->size + GROW_BY);
+ }
+ stack->data[stack->used++] = element;
+}
+
+ETERM* tstack_pop(tstack *stack) {
+ return stack->data[--(stack->used)];
+}
+
+#endif
diff --git a/test/myhtmlex.safe_test.exs b/test/myhtmlex.safe_test.exs
index 75ea5a2..0b60e53 100644
--- a/test/myhtmlex.safe_test.exs
+++ b/test/myhtmlex.safe_test.exs
@@ -1,4 +1,8 @@
defmodule Myhtmlex.SafeTest do
use MyhtmlexSharedTests, module: Myhtmlex.Safe
+
+ test "doesn't segfault when <!----> is encountered" do
+ assert {"html", _attrs, _children} = Myhtmlex.decode("<div> <!----> </div>")
+ end
end
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Nov 29, 1:43 PM (1 d, 18 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
41222
Default Alt Text
(18 KB)
Attached To
Mode
R16 fast_html
Attached
Detach File
Event Timeline
Log In to Comment