I admit that this is basically a duplicate question of Use freebase data on local server? but I need more detailed answers than have already been given there
I\
This is what worked for me. It allows you to load all of a Freebase dump in a standard MySQL installation on less than 100GB of disk. The key is understanding the data layout in a dump and then transforming it (optimizing it for space and speed).
Freebase notions you should understand before you attempt to use this (all taken from the documentation):
Some other important Freebase specifics:
[{'id':'/','mid':null}]'/m/0cwtm' is a human);'/m/03lmb2f' of type '/film/performance' is NOT a Topic (I choose to think of these as what Blank Nodes in RDF are although this may not be philosophically accurate), while '/m/04y78wb' of type '/film/director' (among others) is;(see the Python code at the bottom)
TRANSFORM 1 (from shell, split links from namespaces ignoring notable_for and non /lang/en text):
python parse.py freebase.tsv #end up with freebase_links.tsv and freebase_ns.tsv
TRANSFORM 2 (from Python console, split freebase_ns.tsv into freebase_ns_types.tsv, freebase_ns_props.tsv plus 15 others which we ignore for now):
# Route every row of freebase_ns.tsv into one of the per-namespace output
# files (freebase_ns_types.tsv, freebase_ns_props.tsv, ...), chosen from the
# row's destination column.
import e
e.split_external_keys( 'freebase_ns.tsv' )
TRANSFORM 3 (from Python console, convert property and destination to mids)
import e
# Build the (destination, value) -> source lookup that mid(...) uses to resolve keys.
ns = e.get_namespaced_data( 'freebase_ns_types.tsv' )
# Rewrite the property and destination columns as mids; rows where either one
# cannot be resolved are left out of the output and only counted.
e.replace_property_and_destination_with_mid( 'freebase_links.tsv', ns ) #produces freebase_links_pdmids.tsv
# Rewrite only the property column as a mid; unresolved rows are left out.
e.replace_property_with_mid( 'freebase_ns_props.tsv', ns ) #produces freebase_ns_props_pmids.tsv
TRANSFORM 4 (from MySQL console, load freebase_links_pdmids.tsv, freebase_ns_props_pmids.tsv and freebase_ns_types.tsv in DB):
-- links: one row per link line, loaded below from freebase_links_pdmids.tsv
-- (property and destination already rewritten as mids by TRANSFORM 3).
-- value is VARCHAR(1) because parse.py only routes a row to the links file
-- when its fourth field is empty.
CREATE TABLE links(
source VARCHAR(20),
property VARCHAR(20),
destination VARCHAR(20),
value VARCHAR(1)
) ENGINE=MyISAM CHARACTER SET utf8;
-- ns: namespaced property rows, loaded below from freebase_ns_props_pmids.tsv
-- (property rewritten as a mid, destination and value kept as text).
CREATE TABLE ns(
source VARCHAR(20),
property VARCHAR(20),
destination VARCHAR(40),
value VARCHAR(255)
) ENGINE=MyISAM CHARACTER SET utf8;
-- types: key rows kept in their original textual form (no mid rewriting).
CREATE TABLE types(
source VARCHAR(20),
property VARCHAR(40),
destination VARCHAR(40),
value VARCHAR(40)
) ENGINE=MyISAM CHARACTER SET utf8;
-- Bulk-load the tab-separated files, one input line per row.
LOAD DATA LOCAL INFILE "/data/freebase_links_pdmids.tsv" INTO TABLE links FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
LOAD DATA LOCAL INFILE "/data/freebase_ns_props_pmids.tsv" INTO TABLE ns FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
-- NOTE(review): none of the transforms shown here writes a file called
-- freebase_ns_base_plus_types.tsv; presumably this is freebase_ns_types.tsv
-- (which already receives the /base key rows) under another name -- confirm.
LOAD DATA LOCAL INFILE "/data/freebase_ns_base_plus_types.tsv" INTO TABLE types FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
-- Indexes are created after the bulk load, on the columns used for lookups.
CREATE INDEX links_source ON links (source) USING BTREE;
CREATE INDEX ns_source ON ns (source) USING BTREE;
CREATE INDEX ns_value ON ns (value) USING BTREE;
CREATE INDEX types_source ON types (source) USING BTREE;
-- presumably mirrors the (namespace, key) lookup done by mid(...) in e.py -- verify
CREATE INDEX types_destination_value ON types (destination, value) USING BTREE;
Save this as e.py:
import sys
#returns a dict to be used by mid(...), replace_property_and_destination_with_mid(...) below
def get_namespaced_data( file_name ):
    """Build the lookup table used by mid(...) from a 4-column TSV file.

    Each line is expected to hold four tab-separated fields. The result maps
    the pair (third field, fourth field) to the first field; when the same
    pair occurs more than once, the last occurrence wins.

    Lines with fewer than four fields are reported with 'Skip...' and
    ignored.

    file_name -- path of the tab-separated input file.
    Returns a dict {(field3, field4): field1}.
    """
    result = {}
    # 'with' guarantees the handle is closed even if a line fails to parse.
    with open( file_name ) as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no trailing newline.
            elements = line.rstrip('\n').split('\t')
            if len( elements ) < 4:
                print( 'Skip...' )
                continue
            result[(elements[2], elements[3])] = elements[0]
    return result
#runs out of memory
def load_links( file_name ):
    """Load a links TSV file into a nested dict kept entirely in memory.

    The first three tab-separated fields of each line are read as source,
    property and destination. The result has the shape
    {source: {property: [destination, ...]}}, with destinations kept in file
    order.

    Progress is printed each time the number of distinct sources reaches a
    multiple of one million (including 0 at the start). Lines with fewer
    than three fields are reported with 'Skip...' and ignored.

    file_name -- path of the tab-separated input file.
    Returns the nested dict described above.
    """
    result = {}
    last_reported = None
    with open( file_name ) as f:
        for line in f:
            size = len( result )
            # Report each milestone once; the size does not grow while
            # consecutive lines share already-known sources.
            if size % 1000000 == 0 and size != last_reported:
                print( size )
                last_reported = size
            # Strip only the newline so an unterminated last line keeps its
            # final character.
            elements = line.rstrip('\n').split('\t')
            if len( elements ) < 3:
                print( 'Skip...' )
                continue
            src, prop, dest = elements[0], elements[1], elements[2]
            result.setdefault( src, {} ).setdefault( prop, [] ).append( dest )
    return result
#same as load_links but for the namespaced data
def load_ns( file_name ):
    """Load a namespaced-data TSV file into a nested dict kept in memory.

    Works like load_links(...) but collects the fourth field (value) instead
    of the third: the result has the shape {source: {property: [value, ...]}},
    with values kept in file order.

    Progress is printed each time the number of distinct sources reaches a
    multiple of one million (including 0 at the start). Lines with fewer
    than four fields are reported with 'Skip...' and ignored.

    file_name -- path of the tab-separated input file.
    Returns the nested dict described above.
    """
    result = {}
    last_reported = None
    with open( file_name ) as f:
        for line in f:
            size = len( result )
            # Report each milestone once; the size does not grow while
            # consecutive lines share already-known sources.
            if size % 1000000 == 0 and size != last_reported:
                print( size )
                last_reported = size
            # Strip only the newline so an unterminated last line keeps its
            # final character.
            elements = line.rstrip('\n').split('\t')
            if len( elements ) < 4:
                print( 'Skip...' )
                continue
            src, prop, value = elements[0], elements[1], elements[3]
            result.setdefault( src, {} ).setdefault( prop, [] ).append( value )
    return result
def links_in_set( file_name ):
    """Return the set of distinct first-column values of a TSV file.

    An empty line contributes the empty string, exactly as a line whose
    first field is empty would.

    file_name -- path of the tab-separated input file.
    Returns a set of strings.
    """
    with open( file_name ) as f:
        # Strip only the newline: a single-field last line without a
        # terminator must not lose its final character.
        return set( line.rstrip('\n').split('\t', 1)[0] for line in f )
def mid( key, ns ):
    """Resolve a Freebase key to its mid using the lookup table ns.

    key -- a slash-separated identifier such as '/film/director'; the bare
           root '/' is treated as '/boot/root_namespace'.
    ns  -- dict mapping (namespace, last path segment) to a mid, as built by
           get_namespaced_data(...).

    Returns the key itself when its first path segment is 'm' (it already is
    a mid), the mid found in ns otherwise, and False when the key is empty,
    contains no '/', or is not present in ns.
    """
    if key == '':
        return False
    lookup = '/boot/root_namespace' if key == '/' else key
    segments = lookup.split('/')
    if len( segments ) == 1:
        # No '/' at all: echo the offending key and give up.
        print( lookup )
        return False
    if segments[1] == 'm':
        return lookup
    # Everything before the last segment is the namespace, the rest is the key.
    return ns.get( ('/'.join( segments[:-1] ), segments[-1]), False )
def replace_property_and_destination_with_mid( file_name, ns ):
    """Rewrite the property and destination columns of a links file as mids.

    Reads file_name line by line, resolves the second and third
    tab-separated fields through mid(..., ns) and writes the rewritten line
    to '<file_name without extension>_pdmids.tsv'. A line is written only
    when both fields resolve; every other line (including lines with fewer
    than three fields) is counted, and the total is printed at the end as
    'Skipped: N'.

    file_name -- path of the tab-separated input file.
    ns        -- lookup dict as built by get_namespaced_data(...).
    """
    # Local import so the function does not depend on module-level imports.
    import os

    # splitext only removes the final extension; splitting on the first '.'
    # breaks for paths such as './links.tsv' or '/data/v1.2/links.tsv'.
    out_name = os.path.splitext( file_name )[0] + '_pdmids' + '.tsv'
    skipped = 0
    with open( file_name ) as f:
        with open( out_name, 'w' ) as f_out_mids:
            for line in f:
                # Strip only the newline so an unterminated last line keeps
                # its final character.
                elements = line.rstrip('\n').split('\t')
                if len( elements ) < 3:
                    skipped += 1
                    continue
                prop_mid = mid( elements[1], ns )
                dest_mid = mid( elements[2], ns )
                if prop_mid and dest_mid:
                    elements[1] = prop_mid
                    elements[2] = dest_mid
                    f_out_mids.write( '\t'.join( elements ) + '\n' )
                else:
                    skipped += 1
    print( 'Skipped: ' + str( skipped ) )
def replace_property_with_mid( file_name, ns ):
    """Rewrite the property column of a namespaced-data file as a mid.

    Reads file_name line by line, resolves the second tab-separated field
    through mid(..., ns) and writes the rewritten line to
    '<file_name without extension>_pmids.tsv'. Lines whose property cannot
    be resolved, or that have fewer than two fields, are silently left out.

    file_name -- path of the tab-separated input file.
    ns        -- lookup dict as built by get_namespaced_data(...).
    """
    # Local import so the function does not depend on module-level imports.
    import os

    # splitext only removes the final extension; splitting on the first '.'
    # breaks for paths such as './props.tsv' or '/data/v1.2/props.tsv'.
    out_name = os.path.splitext( file_name )[0] + '_pmids' + '.tsv'
    with open( file_name ) as f:
        with open( out_name, 'w' ) as f_out_mids:
            for line in f:
                # Strip only the newline so an unterminated last line keeps
                # its final character.
                elements = line.rstrip('\n').split('\t')
                if len( elements ) < 2:
                    continue
                prop_mid = mid( elements[1], ns )
                if prop_mid:
                    elements[1] = prop_mid
                    f_out_mids.write( '\t'.join( elements ) + '\n' )
#cPickle
#ns=e.get_namespaced_data('freebase_2.tsv')
#import cPickle
#cPickle.dump( ns, open('ttt.dump','wb'), protocol=2 )
#ns=cPickle.load( open('ttt.dump','rb') )
#fn='/m/0'
#n=fn.split('/')[2]
#dir = n[:-1]
def is_mid( value ):
    """Tell whether value looks like a mid.

    A value counts as a mid when it contains at least one '/' and the path
    segment right after the first '/' is exactly 'm' (e.g. '/m/0cwtm').

    Returns True or False.
    """
    segments = value.split('/')
    # A single segment means there was no '/' in the value at all.
    return len( segments ) > 1 and segments[1] == 'm'
def check_if_property_or_destination_are_mid( file_name ):
    """Print every line of a TSV file whose property column is a mid.

    Despite the name, only the second tab-separated field (property) is
    tested with is_mid(...); the destination column is not checked. Lines
    with fewer than two fields are ignored.

    file_name -- path of the tab-separated input file.
    """
    with open( file_name ) as f:
        for line in f:
            # Strip only the newline so an unterminated last line keeps its
            # final character.
            elements = line.rstrip('\n').split('\t')
            if len( elements ) > 1 and is_mid( elements[1] ):
                print( line )
#
def split_external_keys( file_name ):
    """Split a namespaced-data TSV file into one file per destination namespace.

    Every input line is copied unchanged into exactly one output file named
    '<file_name without extension>_<suffix>.tsv'. All seventeen output files
    are created, even when nothing is routed to them. Routing looks at the
    second field (property) and third field (destination):

    * destination without any '/': 'types' when the property is
      '/type/object/key', otherwise 'props';
    * destination '/lang/en': 'props';
    * otherwise by the first path segment of the destination:
      'm' -> m, 'en' -> intkeys, 'user' -> usr, 'uri' -> uri at any depth;
      'wikipedia'/'authority' -> extkeys, 'source' -> src, 'biology' -> blg,
      'business' -> bus, 'soft' -> soft, 'quotationsbook' -> quot,
      'freebase' -> frb, 'tag' -> tag, 'guid' -> guid, 'dataworld' -> dtwrld
      only when the destination has something below that segment;
      'base' with something below it -> 'types' for '/type/object/key'
      rows and 'base' for all others;
    * everything else: 'types'.

    Lines with fewer than three fields are reported with 'Skip...' and
    ignored.

    file_name -- path of the tab-separated input file.
    """
    # Local import so the function does not depend on module-level imports.
    import os

    # splitext only removes the final extension; splitting on the first '.'
    # breaks for paths such as './ns.tsv' or '/data/v1.2/ns.tsv'.
    base = os.path.splitext( file_name )[0]
    suffixes = ('extkeys', 'intkeys', 'props', 'types', 'm', 'src', 'usr',
                'base', 'blg', 'bus', 'soft', 'uri', 'quot', 'frb', 'tag',
                'guid', 'dtwrld')
    # First path segment -> output suffix, matched at any depth.
    any_depth = {'m': 'm', 'en': 'intkeys', 'user': 'usr', 'uri': 'uri'}
    # First path segment -> output suffix, matched only when the destination
    # has at least one more segment below it.
    nested_only = {'wikipedia': 'extkeys', 'authority': 'extkeys',
                   'source': 'src', 'biology': 'blg', 'business': 'bus',
                   'soft': 'soft', 'quotationsbook': 'quot',
                   'freebase': 'frb', 'tag': 'tag', 'guid': 'guid',
                   'dataworld': 'dtwrld'}

    # Open the input first so a missing input file creates no outputs.
    with open( file_name ) as f:
        outputs = {}
        try:
            for suffix in suffixes:
                outputs[suffix] = open( base + '_' + suffix + '.tsv', 'w' )
            for line in f:
                # Strip only the newline so an unterminated last line keeps
                # its final character.
                elements = line.rstrip('\n').split('\t')
                if len( elements ) < 3:
                    print( 'Skip...' )
                    continue
                prop, dest = elements[1], elements[2]
                parts = dest.split('/')
                is_key = prop == '/type/object/key'
                if len( parts ) == 1:
                    target = 'types' if is_key else 'props'
                elif dest == '/lang/en':
                    target = 'props'
                else:
                    top = parts[1]
                    nested = len( parts ) > 2
                    if top in any_depth:
                        target = any_depth[top]
                    elif top == 'base' and nested:
                        target = 'types' if is_key else 'base'
                    elif nested and top in nested_only:
                        target = nested_only[top]
                    else:
                        target = 'types'
                outputs[target].write( line )
        finally:
            # Close (and thereby flush) every output that was opened.
            for handle in outputs.values():
                handle.close()
Save this as parse.py:
import sys
def parse_freebase_quadruple_tsv_file( file_name ):
    """Split a Freebase quadruple dump into a links file and a namespace file.

    Each input line is expected to hold four tab-separated fields (source,
    property, destination, value). Lines are copied unchanged to one of two
    outputs named after the input file without its extension:

    * '<base>_links.tsv' -- destination is set and value is empty;
    * '<base>_ns.tsv'    -- every other line, except those whose destination
      starts with '/lang/' and is not '/lang/en'.

    Lines whose property ends with '/notable_for' are dropped, and lines
    with fewer than four fields are reported with 'Skip...' and dropped.

    file_name -- path of the tab-separated dump file.
    """
    # Local import so the function does not depend on module-level imports.
    import os

    # splitext only removes the final extension; splitting on the first '.'
    # breaks for paths such as './freebase.tsv' or '/data/v1.2/freebase.tsv'.
    base = os.path.splitext( file_name )[0]
    with open( file_name ) as f:
        with open( base + '_links' + '.tsv', 'w' ) as f_out_links:
            with open( base + '_ns' + '.tsv', 'w' ) as f_out_ns:
                for line in f:
                    # Strip only the newline: slicing off the last character
                    # turns an unterminated final link line (which ends in a
                    # tab) into a 3-field line that would be skipped.
                    elements = line.rstrip('\n').split('\t')
                    if len( elements ) < 4:
                        print( 'Skip...' )
                        continue
                    prop, dest, value = elements[1], elements[2], elements[3]
                    #cases described here http://wiki.freebase.com/wiki/Data_dumps
                    if prop.endswith('/notable_for'): #ignore notable_for, it has JSON in it
                        continue
                    if dest and not value: #case 1, linked
                        f_out_links.write( line )
                    elif dest == '/lang/en' or not dest.startswith('/lang/'): #ignore languages other than English
                        f_out_ns.write( line )
# Script entry: every command-line argument is treated as a .tsv dump to
# split; with no arguments only the usage hint is printed.
if not sys.argv[1:]:
    print( 'Pass a list of .tsv filenames' )
for file_name in sys.argv[1:]:
    parse_freebase_quadruple_tsv_file( file_name )
And the standard disclaimer here: it has been a few months since I did this. I believe it is mostly correct, but I apologize if my notes missed something. Unfortunately the project I needed it for fell through the cracks, but I hope this helps someone else. If something isn't clear, drop a comment here.