i = 1  # counts the result pages returned for this request
while hasMore and i < 51 and (not isCaught):  # at most 50 pages are returned; parse each page and write it to the result file
    source_url = url + str(i)  # build the URL of this page
    data = ''  # holds the raw HTML of this page
    goon = True  # network-failure flag
    ## if the network is unstable, retry the request up to maxTryNum (three) times
    for tryNum in range(maxTryNum):
        try:
            # set header http://stackoverflow.com/questions/385262/how-do-i-send-a-custom-header-with-urllib2-in-a-http-request
            # see this http://stackoverflow.com/questions/1653591/python-urllib2-response-header , so you can see 'gzip' in response header with urllib2.urlopen(URL).info()
            # and this http://stackoverflow.com/questions/3947120/does-python-urllib2-automatically-uncompress-gzip-data-fetched-from-webpage teach you how to decode gzip
            #
            #r = requests.get(source_url, cookies=cookie, headers=headers, timeout=12)
            send_headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, sdch",
                "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                #"Cookie": "cookies here"
                "Host": "s.weibo.com",
                "Pragma": "no-cache",
                "Referer": "http://s.weibo.com/",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
            }
            req = urllib2.Request(source_url, headers=send_headers)
            r = urllib2.urlopen(req)
            if r.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(r.read())
                f = gzip.GzipFile(fileobj=buf)
                html = f.read()
            else:
                html = r.read()
            data = html
            #print data
            break
        except:
            if tryNum < (maxTryNum - 1):
                time.sleep(10)
            else:
                print 'Internet Connect Error!'
                self.logger.error('Internet Connect Error!')
                self.logger.info('filePath: ' + self.save_dir)
                self.logger.info('url: ' + source_url)
                self.logger.info('fileNum: ' + str(fileNum))
                self.logger.info('page: ' + str(i))
                self.flag = False
                goon = False
                break
    if goon:
        lines = data.splitlines()
        isCaught = True
        for line in lines:
            ## check whether the line carries Weibo content; if it appears, we have not been flagged as a robot
            if line.startswith('<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"'):
                isCaught = False
                n = line.find('html":"')
                if n > 0:
                    j = line[n + 7: -12].encode("utf-8").decode('unicode_escape').encode("utf-8").replace("\\", "").decode('utf-8')
                    ## page with no more results
                    if (j.find('<div class="search_noresult">') > 0):
                        hasMore = False
                    ## page that contains results
                    else:
                        page = etree.HTML(j)
                        comment_box = page.xpath("//div[@action-type=\"feed_list_item\"]")
                        for each_comment in comment_box:
                            comment_time = each_comment.xpath(".//div[@class=\"feed_from W_textb\"]/a/@title")
                            comment_text_list = each_comment.xpath(".//p[@class='comment_txt']")
                            comment_text = []
                            for eachtext in comment_text_list:
                                comment_text = '\r'.join(eachtext.xpath(".//text()"))
                                comment_text = comment_text.replace(' ', '').replace('\n', '').replace('\r', ' ')
                                comment_text = comment_text.encode('utf8')
                            if (comment_text != 'None' and comment_text not in filter):
                                filter.add(comment_text)
                                content.writerow([comment_time[0], comment_text])
                        # weibo = page.xpath("//div[@class=\"feed_from W_textb\"])
                        # for weibo_time, weibo_content in weibo:
                        #     weibo_time = weibo_time.xpath("./[@class=\"feed_from W_textb\"]/a/@title")
                        #     concept = 'r'.join(weibo_content.xpath("./text()"))
                        #     concept = concept.replace(' ','').replace('\n','').replace('\r',' ')
                        #     concept = concept.encode('utf8')
                        #     if (concept != 'None' and concept not in filter):
                        #         filter.add(concept)
                        #         content.writerow([weibo_time, concept])
                        # dls_id = page.xpath("//div[@class=\"feed_from W_textb\"]/a/@title")
                        # for dl_id in dls_id:
                        #     # mid = str(dl_id.attrib.get('nick-name'))
                        #     dl_id = dl_id.encode('utf8')
                        #     if (dl_id != 'None' and dl_id not in mid_filter):
                        #         mid_filter.add(dl_id)
                        #         content.writerow([dl_id])
                        #
                        # dls_text = page.xpath("//p[@class='comment_txt']")
                        # for dl_text in dls_text:
                        #     concept = 'r'.join(dl_text.xpath("./text()"))
                        #     concept = concept.replace(' ','').replace('\n','').replace('\r',' ')
                        #     concept = concept.encode('utf8')
                        #     if (concept != 'None' and concept not in mid_filter):
                        #         mid_filter.add(concept)
                        #         content.writerow([concept])
                        #         content.writerow('\n')
                        #     if (concept != 'None' and concept not in mid_filter):
                        #         mid_filter.add(concept)
                        #         content.write(concept)
                        #         content.write('\n')
                        #         print concept
                        #         print "----------------------"
                break
        lines = None
        ## handle the case where we have been flagged as a robot
        if isCaught:
            print 'Be Caught!'
            self.logger.error('Be Caught Error!')
            self.logger.info('filePath: ' + self.save_dir)
            self.logger.info('url: ' + source_url)
            self.logger.info('fileNum: ' + str(fileNum))
            self.logger.info('page:' + str(i))
            data = None
            self.flag = False
            break
        ## no more results: finish this request and move on to the next one
        if not hasMore:
            print 'No More Results!'
            if i == 1:
                time.sleep(random.randint(55, 75))
            else:
                time.sleep(15)
            data = None
            break
        i += 1
        ## randomize the sleep time between two adjacent URL requests to avoid being caught; no simulated login yet
        # sleeptime_one = random.randint(self.interval-30,self.interval-10)
        # sleeptime_two = random.randint(self.interval+10,self.interval+30)
        # if i%2 == 0:
        #     sleeptime = sleeptime_two
        # else:
        #     sleeptime = sleeptime_one
        # print 'sleeping ' + str(sleeptime) + ' seconds...'
        # time.sleep(sleeptime)
    else:
        break
csvFile.close()
csvFile = None
while True:
    ## read the keyboard input
    keyword = raw_input('Enter the keyword(type \'quit\' to exit ):')
    if keyword == 'quit':
        sys.exit()
    startTime = raw_input('Enter the start time(Format:YYYY-mm-dd-HH):')
    # region = raw_input('Enter the region([BJ]11:1000,[SH]31:1000,[GZ]44:1,[CD]51:1):')
    savedir = raw_input('Enter the save directory(Like C://data//):')
    interval = raw_input('Enter the time interval( >30 and default:50):')
    ## instantiate the collector class and crawl Weibo posts for the given keyword and start time
    cd = CollectData(keyword, startTime, savedir, interval)
    while cd.flag:
        print cd.timescope
        logger.info(cd.timescope)
        url = cd.getURL()
        cd.download(url)
        cd.timescope = cd.getTimescope(cd.timescope, 1)  # advance the search window to the next hour
    else:
        cd = None
        print '-----------------------------------------------------'
        print '-----------------------------------------------------'
else:
    logger.removeHandler(filehandler)
    logger = None

if __name__ == '__main__':
    main()
Test creat
hello 0
hello 0
hello 0world !
Test creat
Test delete:hello 0world !
567
Test creat
Test creat
Test delete:JQK
789
Test delete:789
Test delete:567
var events = require("events");
var emitter = new events.EventEmitter();     // an EventEmitter instance is created with `new`

emitter.emit(eventName, data);               // fire eventName, passing data to the listeners

// ways to register a listener (the callback receives data):
emitter.addListener(eventName, callback);    // runs on every eventName
emitter.on(eventName, callback);             // alias of addListener
emitter.once(eventName, callback);           // runs only for the first eventName, then is removed
The authors' contribution: a distributed engine that handles billions, even trillions, of triples without relational tables or bitmap matrices, built on top of a memory cloud. They first argue that storing triples on disk is a poor fit for random access: even with good indexes, the extra join operations dominate query time. Instead, they use in-memory graph exploration instead of join operations for SPARQL processing. By contrast, earlier systems each query their own partition of the triples in isolation without reusing results that are already available, incurring large intermediate overhead. The paper explains how to reduce the number of joins and the size of intermediate results, shows that performance stays good even without a high-quality partitioning, and supports a wide range of graph analytics.
On the graph partitioning problem: for natural graphs, the high-degree nodes and the nodes connected to them drive the placement decision, and using in/out indirect ids effectively reduces the amount of data exchanged between machines.
For nodes with few neighbors, however, a different scheme works better: keep a node's in-edges and out-edges on the same machine.
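A minimal sketch of this placement idea, using my own made-up threshold and hash-based machine assignment rather than anything from the paper: low-degree nodes keep both adjacency lists on one machine, while high-degree nodes are split per machine and exposed through small indirect ids so that only the ids travel across the network.

# A minimal sketch (my own illustration, not Trinity.RDF's code) of degree-based placement.
DEGREE_THRESHOLD = 100        # assumed cut-off between "small" and "large" nodes
NUM_MACHINES = 3

def place_node(node_id, in_neighbors, out_neighbors):
    """Return a per-machine layout for one node's adjacency data."""
    degree = len(in_neighbors) + len(out_neighbors)
    home = hash(node_id) % NUM_MACHINES
    if degree < DEGREE_THRESHOLD:
        # low-degree node: co-locate its in- and out-lists on its home machine
        return {home: {"in": list(in_neighbors), "out": list(out_neighbors)}}
    # high-degree node: each machine stores only the neighbors it owns, and the
    # home machine keeps indirect ids pointing at those partial lists
    layout = {m: {"in": [], "out": []} for m in range(NUM_MACHINES)}
    for n in in_neighbors:
        layout[hash(n) % NUM_MACHINES]["in"].append(n)
    for n in out_neighbors:
        layout[hash(n) % NUM_MACHINES]["out"].append(n)
    layout[home]["indirect"] = {m: ("in%d" % m, "out%d" % m) for m in range(NUM_MACHINES)}
    return layout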
It also uses two predicate indexes:
Local predicate index: on each machine, sort all (predicate, node-id) pairs.
Global predicate index: map each predicate to (predicate, ⟨subject-list, object-list⟩).
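A rough sketch, under my own simplifications, of how the two indexes could be built from (subject, predicate, object) triples: the local index is a sorted list of (predicate, node-id) entries per machine, and the global index maps each predicate to its subject list and object list.

# A rough sketch (my own simplification, not the paper's implementation) of the two indexes.
from collections import defaultdict

def build_local_predicate_index(local_triples):
    """Sorted (predicate, node-id, direction) entries for the triples stored on one machine."""
    entries = []
    for s, p, o in local_triples:
        entries.append((p, s, "out"))   # s has an outgoing edge labeled p
        entries.append((p, o, "in"))    # o has an incoming edge labeled p
    return sorted(entries)

def build_global_predicate_index(all_triples):
    """Map each predicate to (subject-list, object-list)."""
    index = defaultdict(lambda: (set(), set()))
    for s, p, o in all_triples:
        subs, objs = index[p]
        subs.add(s)
        objs.add(o)
    return {p: (sorted(subs), sorted(objs)) for p, (subs, objs) in index.items()}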
Basic graph operations provided:
LoadNodes(predicate, direction): return nodes that have an incoming or outgoing edge labeled as predicate. Uses the global predicate index.
LoadNeighborsOnMachine(node, direction, machine i): for a given node, return its incoming or outgoing neighbors that reside on machine i. What is returned is the indirect id mentioned above.
SelectByPredicate(nid, predicate): from a given partial adjacency list specified by nid, return nodes that are labeled with the given predicate. Example (Figure 4): LoadNodes(l2, out) finds n2 on machine 1 and n3 on machine 2; LoadNeighborsOnMachine(n0, in, 1) returns the partial adjacency list's id in1, and SelectByPredicate(in1, l2) returns n2.
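A minimal sketch of the three operations over toy in-memory structures of my own (the sample data, node_directory, and partial_adjacency are illustrative, not Trinity.RDF's API); the final lines mirror the Figure 4 walk-through quoted above.

# A minimal sketch with toy data (my own structures, not the paper's actual code).
global_predicate_index = {                 # predicate -> direction -> nodes
    "l2": {"out": ["n2", "n3"], "in": ["n0"]},
    "l1": {"out": ["n5"], "in": ["n0"]},
}
partial_adjacency = {                      # indirect id -> (neighbor, predicate) pairs
    "in1": [("n2", "l2"), ("n5", "l1")],   # n0's in-neighbors stored on machine 1
    "in2": [("n3", "l2")],                 # n0's in-neighbors stored on machine 2
}
node_directory = {                         # node -> direction -> machine -> indirect id
    "n0": {"in": {1: "in1", 2: "in2"}},
}

def load_nodes(predicate, direction):
    """Nodes that have an incoming or outgoing edge labeled `predicate` (global index)."""
    return global_predicate_index[predicate][direction]

def load_neighbors_on_machine(node, direction, machine):
    """Return the indirect id of `node`'s partial adjacency list on `machine`."""
    return node_directory[node][direction][machine]

def select_by_predicate(nid, predicate):
    """From the partial adjacency list `nid`, return the nodes reached via `predicate`."""
    return [n for n, p in partial_adjacency[nid] if p == predicate]

# Mirrors the Figure 4 walk-through in the notes above:
print(load_nodes("l2", "out"))                   # ['n2', 'n3']
nid = load_neighbors_on_machine("n0", "in", 1)   # 'in1'
print(select_by_predicate(nid, "l2"))            # ['n2']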
Query Processing
First decompose the query Q into triple patterns q1, q2, q3, ..., qn, match them in parallel, and finally combine (join) their results.
How a single triple pattern q is matched:
For a triple pattern q, our goal is to find all its matches R(q). Let P denote the predicate in q, V denote the variables in q, and B(V) denote the binding of V. If V is a free variable (not bound), we also use B(V) to denote all possible values V can take.
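A small sketch, in my own simplified notation rather than the paper's algorithm, of matching a single triple pattern q = (S, P, O) against a triple list given current bindings B: a variable missing from B is free and ranges over all values, a bound variable is restricted to its binding set, and R(q) is the set of consistent matches.

# A small sketch (my own simplification) of matching one triple pattern.
def is_var(term):
    return isinstance(term, str) and term.startswith("?")

def match_pattern(triples, q, B):
    """Return R(q): all (subject, object) pairs from `triples` consistent with q and B."""
    S, P, O = q
    result = []
    for s, p, o in triples:
        if p != P:                              # predicates are treated as constants here
            continue
        if is_var(S):
            if S in B and s not in B[S]:        # bound variable must respect its binding set
                continue
        elif s != S:                            # constant subject must match exactly
            continue
        if is_var(O):
            if O in B and o not in B[O]:
                continue
        elif o != O:
            continue
        result.append((s, o))
    return result

triples = [("n2", "l2", "n0"), ("n3", "l2", "n0"), ("n5", "l1", "n0")]
print(match_pattern(triples, ("?x", "l2", "n0"), {}))              # free ?x: both n2 and n3 match
print(match_pattern(triples, ("?x", "l2", "n0"), {"?x": {"n2"}}))  # bound ?x: only n2 matches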
Heuristic 1. We expand a subgraph from its exploration point. We combine two subgraphs by connecting their exploration points.
Property 1. We expand a subgraph or combine two subgraphs through an edge. The two nodes on both ends of the edge are valid exploration points in the new graph.
Theorem 1. For a query graph G(V, E), the DP has time complexity O(n · |V| · |E|), where n is the number of connected subgraphs in G.
Theorem 2. Any acyclic query Q with query graph G is guaranteed to have an exploration plan.
Discussion. There are two cases we have not considered formally: i) G is cyclic, and ii) G contains a join on predicates.
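To make Heuristic 1, Property 1, and the DP of Theorem 1 concrete, here is a very rough sketch of my own (not the paper's planner): a state is a pair (set of covered query edges, exploration point); states grow either by expanding through an edge incident to the exploration point or by combining two disjoint states whose exploration points are joined by an edge. The cost model is a placeholder, and, as in the acyclic case above, cyclic queries and joins on predicates are ignored.

# A very rough sketch (my own reconstruction of the idea) of DP over connected subgraphs.
import itertools

def edge_cost(edge):
    return 1.0                       # placeholder cost model

def plan(query_edges):
    """query_edges: list of (u, v) pairs forming the query graph."""
    query_edges = list(query_edges)
    best = {}                        # (frozenset(edges), exploration_point) -> cheapest cost
    for e in query_edges:            # single-edge seeds; both endpoints are valid points
        for point in e:
            best[(frozenset([e]), point)] = edge_cost(e)
    changed = True
    while changed:
        changed = False
        states = list(best.items())
        # 1) expansion: grow a subgraph through an edge touching its exploration point
        for (edges, point), cost in states:
            for e in query_edges:
                if e in edges or point not in e:
                    continue
                for new_point in e:  # Property 1: either endpoint may become the new point
                    key = (edges | frozenset([e]), new_point)
                    new_cost = cost + edge_cost(e)
                    if new_cost < best.get(key, float("inf")):
                        best[key] = new_cost
                        changed = True
        # 2) combination: join two disjoint subgraphs via an edge between their points
        for ((e1, p1), c1), ((e2, p2), c2) in itertools.combinations(states, 2):
            if e1 & e2:
                continue
            bridge = (p1, p2) if (p1, p2) in query_edges else (p2, p1)
            if bridge not in query_edges or bridge in (e1 | e2):
                continue
            for new_point in bridge:
                key = (e1 | e2 | frozenset([bridge]), new_point)
                new_cost = c1 + c2 + edge_cost(bridge)
                if new_cost < best.get(key, float("inf")):
                    best[key] = new_cost
                    changed = True
    full = frozenset(query_edges)
    return min(c for (edges, _), c in best.items() if edges == full)

print(plan([("?x", "?y"), ("?y", "?z"), ("?z", "?w")]))   # chain query, unit costs -> 3.0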
RDF engines for storage, indexing, and querying have been around for some years; in particular, the Jena framework by HP Labs is notably popular, and Oracle also provides RDF support for semantic data integration in life sciences and enterprises [11, 29]. However, with the exception of the VLDB 2007 paper by Abadi et al. [1], none of the prior implementations could demonstrate convincing efficiency, failing to scale up towards large datasets and high load. [1] achieves good performance by grouping triples with the same property name into property tables, mapping these onto a column store, and creating materialized views for frequent joins.
Most publicly accessible RDF systems map RDF triples onto relational tables (e.g., RDFSuite [2, 32], Sesame [8, 28], Jena [23, 46], the C-Store-based RDF engine of [1], and also Oracle's RDF MATCH implementation [11]). There are two extreme ways of doing this:
Pruning is mainly based on estimated execution cost: the optimizer calls the cost model for each generated plan and prunes equivalent plans that are dominated by a cheaper alternative. This pruning mechanism relies on order optimization [35] to decide whether one plan can be dominated by another. Because the optimizer has indexes over all triple permutations at its disposal, it can produce tuples in essentially any order, which makes merge joins very attractive. A plan is therefore retained even if it is more expensive, as long as it produces an interesting order that can be exploited later. Note that orderings are created not only by index scans but also by functional dependencies induced by selections, so the order-optimization component is non-trivial [35]. Starting from the seeds, larger plans are created by joining the optimal solutions of smaller problems. During this process, all attribute handling is implemented as reasoning over the available equivalence classes rather than over individual variable bindings. Each plan produces at most one binding per equivalence class. This both simplifies the implicit projection in front of pipeline breakers and allows the detection of transitively implied join conditions (i.e., a = b ∧ b = c ⇒ a = c).
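A simplified sketch of this pruning rule, under my own assumptions about how plans are represented (a cost plus an optional produced order); it is not RDF-3X's actual optimizer code, but it shows why a more expensive plan survives when it offers an interesting order that no cheaper plan provides.

# A simplified sketch (my own) of cost-based pruning with interesting orders.
def prune_equivalent_plans(plans):
    """plans: list of dicts like {"cost": 3.0, "order": "subject"}; order may be None."""
    cheapest_for_order = {}                   # produced order -> cheapest plan with that order
    for plan in plans:
        key = plan["order"]
        if key not in cheapest_for_order or plan["cost"] < cheapest_for_order[key]["cost"]:
            cheapest_for_order[key] = plan
    overall = min(plans, key=lambda p: p["cost"])
    kept = [overall]
    for order, plan in cheapest_for_order.items():
        if order is not None and plan is not overall:
            kept.append(plan)                 # kept despite higher cost, for its interesting order
    return kept

candidates = [
    {"cost": 2.0, "order": None},        # cheapest, unordered
    {"cost": 3.0, "order": "subject"},   # pricier but sorted: kept for later merge joins
    {"cost": 5.0, "order": "subject"},   # dominated: same order, higher cost -> pruned
]
print(prune_equivalent_plans(candidates))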
To this end, we precompute the frequent paths in the data graph and keep exact join statistics for them. Frequency here refers to the number of paths with the same label sequence. Note that we use the term path for both chains and stars, since the two structures are similar. We characterize a path P by the sequence of predicates p1, ..., pn seen during its traversal. Using SPARQL syntax, we define a (chain) path P_{p1,...,pn} as

P_{p1,...,pn} := select r1 rn+1 where { (r1 p1 r2). (r2 p2 r3). ... (rn pn rn+1) }
FrequentPath(k)   // computes the k most frequent paths
  C1 = {P_p | p is a predicate in the database}
  sort C1, keep the k most frequent
  C = C1, i = 1
  do
    Ci+1 = ∅
    for each p' ∈ Ci, p predicate in the database
      if top k of C ∪ Ci+1 ∪ {P_{p'p}} includes all subpaths of p'p
        Ci+1 = Ci+1 ∪ {P_{p'p}}
      if top k of C ∪ Ci+1 ∪ {P_{pp'}} includes all subpaths of pp'
        Ci+1 = Ci+1 ∪ {P_{pp'}}
    C = C ∪ Ci+1, sort C, keep the k most frequent
    Ci+1 = Ci+1 ∩ C, i = i + 1
  while Ci ≠ ∅
  return C
Unlike in the Apriori setting, a path that is frequent in our RDF-path sense does not necessarily consist of frequent subpaths. Consider a graph with two star-shaped clusters, in which all end nodes are connected to their respective star centers via the predicates (edge labels) p1 and p2. Now consider a single edge with predicate p3 between the two star centers. In this case the path P_{p3} is infrequent, while the path P_{p1,p3,p2} is frequent. Consequently, we cannot simply use the Apriori algorithm.
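A small self-contained sketch of this counterexample (my own toy construction; the edge directions are assumed so that the chain p1, p3, p2 exists): counting paths by label sequence shows that P_{p3} occurs only once while P_{p1,p3,p2} occurs leaves1 × leaves2 times, so a frequent path can contain an infrequent subpath.

# A small sketch (my own toy graph) of the two-star counterexample.
from collections import defaultdict

def build_two_star_graph(leaves1=100, leaves2=100):
    """Edges as (source, predicate, target); directions chosen so the chain p1,p3,p2 exists."""
    edges = []
    for i in range(leaves1):
        edges.append(("a%d" % i, "p1", "center1"))   # leaves of star 1 point into center1
    edges.append(("center1", "p3", "center2"))        # single bridge edge
    for j in range(leaves2):
        edges.append(("center2", "p2", "b%d" % j))    # center2 points out to leaves of star 2
    return edges

def count_paths(edges, predicates):
    """Count node sequences whose consecutive edges follow the given predicate sequence."""
    out = defaultdict(list)                           # (source, predicate) -> list of targets
    for s, p, t in edges:
        out[(s, p)].append(t)
    nodes = {s for s, _, _ in edges} | {t for _, _, t in edges}
    total = 0
    for start in nodes:
        counts = {start: 1}                           # node -> number of partial paths ending there
        for p in predicates:
            new_counts = defaultdict(int)
            for node, c in counts.items():
                for nxt in out.get((node, p), []):
                    new_counts[nxt] += c
            counts = new_counts
        total += sum(counts.values())
    return total

edges = build_two_star_graph()
print(count_paths(edges, ["p3"]))               # 1     -> P_{p3} is infrequent
print(count_paths(edges, ["p1", "p3", "p2"]))   # 10000 -> P_{p1,p3,p2} is frequent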